├── images ├── bipedal.gif ├── minitaur.gif └── ddpg_graph.png ├── writeup └── Deep_RL_Shallow_Updates_for_Deep_Reinforcement_Learning.pdf ├── play_ls_ddpg.py ├── README.md ├── utils ├── nn_agent_models.py ├── utils.py ├── Experience.py └── srl_algorithms.py ├── train_ls_ddpg.py └── ls_ddpg_main.py /images/bipedal.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/pytorch-ls-ddpg/master/images/bipedal.gif -------------------------------------------------------------------------------- /images/minitaur.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/pytorch-ls-ddpg/master/images/minitaur.gif -------------------------------------------------------------------------------- /images/ddpg_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/pytorch-ls-ddpg/master/images/ddpg_graph.png -------------------------------------------------------------------------------- /writeup/Deep_RL_Shallow_Updates_for_Deep_Reinforcement_Learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/pytorch-ls-ddpg/master/writeup/Deep_RL_Shallow_Updates_for_Deep_Reinforcement_Learning.pdf -------------------------------------------------------------------------------- /play_ls_ddpg.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import pybullet_envs 4 | 5 | import utils.nn_agent_models as agent_model 6 | 7 | import numpy as np 8 | import torch 9 | 10 | # ENV_ID = "MinitaurBulletEnv-v0" 11 | ENV_ID = "BipedalWalker-v2" 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 17 | parser.add_argument("-e", "--env", default=ENV_ID, help="Environment name to use, default=" + ENV_ID) 18 | parser.add_argument("-r", "--record", help="If specified, sets the recording dir, default=Disabled") 19 | args = parser.parse_args() 20 | render = True 21 | spec = gym.envs.registry.spec(args.env) 22 | if spec._kwargs.get('render') and render: 23 | spec._kwargs['render'] = True 24 | env = gym.make(args.env) 25 | if args.record: 26 | env = gym.wrappers.Monitor(env, args.record) 27 | 28 | net = agent_model.DDPGActor(env.observation_space.shape[0], env.action_space.shape[0]) 29 | net.load_state_dict(torch.load(args.model)) 30 | 31 | obs = env.reset() 32 | total_reward = 0.0 33 | total_steps = 0 34 | while True: 35 | obs_v = torch.FloatTensor([obs]) 36 | mu_v = net(obs_v) 37 | action = mu_v.squeeze(dim=0).data.numpy() 38 | action = np.clip(action, -1, 1) 39 | obs, reward, done, _ = env.step(action) 40 | total_reward += reward 41 | total_steps += 1 42 | if render: 43 | env.render() 44 | if done: 45 | env.close() 46 | break 47 | print("In %d steps we got %.3f reward" % (total_steps, total_reward)) 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pytorch-ls-ddpg 2 | PyTorch Implementation of Least-Squares Deep Deterministic Policy Gradients 3 | 4 | Based on the paper: 5 | 6 | Nir Levine, Tom Zahavy, Daniel J. 
Mankowitz, Aviv Tamar, Shie Mannor [Shallow Updates for Deep Reinforcement Learning](https://arxiv.org/abs/1705.07461), NIPS 2017 7 | 8 | Video: 9 | 10 | [YouTube](https://youtu.be/i8Cnas7QrMc) - https://youtu.be/i8Cnas7QrMc 11 | 12 | 13 | ![minitaur](https://github.com/taldatech/pytorch-ls-ddpg/blob/master/images/minitaur.gif) 14 | ![bipedal](https://github.com/taldatech/pytorch-ls-ddpg/blob/master/images/bipedal.gif) 15 | 16 | ![ddpg](https://github.com/taldatech/pytorch-ls-ddpg/blob/master/images/ddpg_graph.png) 17 | 18 | [LS-DQN](https://github.com/taldatech/pytorch-ls-dqn) - https://github.com/taldatech/pytorch-ls-dqn 19 | 20 | - [pytorch-ls-ddpg](#pytorch-ls-ddpg) 21 | * [Background](#background) 22 | * [Prerequisites](#prerequisites) 23 | * [Files in the repository](#files-in-the-repository) 24 | * [API (`ls_ddpg_main.py --help`)](#api---ls-ddpg-mainpy---help--) 25 | * [Playing](#playing) 26 | * [Training](#training) 27 | * [TensorBoard](#tensorboard) 28 | * [References](#references) 29 | 30 | ## Background 31 | The idea of this algorithm is to combine Deep RL (DRL) with Shallow RL (SRL): in this case, we use Deep Deterministic Policy Gradient (DDPG) as the DRL algorithm and 32 | Fitted Q-Iteration (FQI), or its Boosted version (B-FQI), as the SRL algorithm (both can be approximated using least-squares; the full derivation is in the original paper). 33 | Every N_DRL DDPG Critic backprop steps, we apply an LS-UPDATE to the very last layer of the Critic NN, using the complete Replay Buffer, a feature augmentation technique and 34 | Bayesian regularization (which prevents overfitting and makes the LS matrix invertible) to solve the FQI equations. 35 | The assumptions are that the features extracted from the last layer form a rich representation, and that the large batch size used by the SRL algorithm improves stability and performance. 36 | For full derivations and theory, please refer to the original paper. 
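Below is a minimal, illustrative sketch of the LS-UPDATE applied to the Critic's last layer, mirroring `calc_fqi_w_srl` in `utils/srl_algorithms.py`; the function and variable names (`ls_update_critic_last_layer`, `phi`, `y`) are chosen for illustration only, and the actual implementation also solves for the bias via feature augmentation:

```python
import torch

def ls_update_critic_last_layer(phi, y, w, lam=1.0):
    """
    Illustrative LS-UPDATE sketch (see utils/srl_algorithms.py for the full version).
    phi: [N, d] last-hidden-layer features of the Critic for the (s, a) pairs in the Replay Buffer
    y:   [N, 1] FQI regression targets r + gamma * Q_target(s', mu_target(s'))
    w:   [1, d] current last-layer weights, used as the Bayesian (regularization) prior
    """
    n, d = phi.shape
    A = phi.t() @ phi / n  # feature covariance matrix
    b = phi.t() @ y / n    # feature-target correlation vector
    # regularized closed-form solution: w_new = (A + lam * I)^-1 (b + lam * w)
    w_new = torch.inverse(A + lam * torch.eye(d)) @ (b + lam * w.view(-1, 1))
    return w_new.view(1, d)
```

With `--boosting` (B-FQI), the same solve is performed, but the regression target is the truncated Bellman error `clamp(y - Q(s, a), -1, 1)` instead of `y` itself.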
37 | 38 | ## Prerequisites 39 | |Library | Version | 40 | |----------------------|----| 41 | |`Python`| `3.5.5 (Anaconda)`| 42 | |`torch`| `0.4.1`| 43 | |`gym`| `0.10.9`| 44 | |`tensorboard`| `1.12.0`| 45 | |`tensorboardX`| `1.5`| 46 | |`pybullet`| `2.4.2`, https://pypi.org/project/pybullet/| 47 | |`Box2D`| `2.3.8`| 48 | 49 | ## Files in the repository 50 | 51 | |File name | Purpose | 52 | |----------------------|------| 53 | |`ls_ddpg_main.py`| general-purpose main application for training/playing an LS-DDPG agent| 54 | |`play_ls_ddpg.py`| sample code for playing an environment, also in `ls_ddpg_main.py`| 55 | |`train_ls_ddpg.py`| sample code for training an environment, also in `ls_ddpg_main.py`| 56 | |`nn_agent_models.py`| agent and DDPG classes, holds the network, action selector and current state| 57 | |`Experience.py`| Replay Buffer classes| 58 | |`srl_algorithms.py`| Shallow RL algorithms, LS-UPDATE| 59 | |`utils.py`| utility functions| 60 | |`*.pth` / `*.dat`| Checkpoint files for the Agents (playing/continual learning)| 61 | |`Deep_RL_Shallow_Updates_for_Deep_Reinforcement_Learning.pdf`| Writeup - theory and results| 62 | 63 | ## API (`ls_ddpg_main.py --help`) 64 | 65 | 66 | You should use the `ls_ddpg_main.py` file with the following arguments: 67 | 68 | |Argument | Description | 69 | |-------------------------|---------------------------------------------| 70 | |-h, --help | shows arguments description | 71 | |-t, --train | train or continue training an agent | 72 | |-p, --play | play the environment using a pretrained agent | 73 | |-n, --name | model name, for saving and loading | 74 | |-k, --lsddpg | use LS-DDPG (apply LS-UPDATE every N_DRL), default: false | 75 | |-j, --boosting| use Boosted-FQI as SRL algorithm, default: false | 76 | |-y, --path| path to agent checkpoint, for playing | 77 | |-e, --env| environment to play: MinitaurBulletEnv-v0, BipedalWalker-v2, default="BipedalWalker-v2" | 78 | |-d, --decay_rate| number of episodes for epsilon decaying, default: 500000 | 79 | |-o, --optimizer| optimizing algorithm ('RMSprop', 'Adam'), default: 'Adam' | 80 | |--lr_critic| learning rate for the Critic optimizer, default: 0.0001 | 81 | |--lr_actor| learning rate for the Actor optimizer, default: 0.0001 | 82 | |-g, --gamma| gamma parameter for the Q-Learning, default: 0.99 | 83 | |-l, --lam| regularization parameter value, default: 1, 10000 (boosting) | 84 | |-s, --buffer_size| Replay Buffer size, default: 1000000 | 85 | |-b, --batch_size| number of samples in each batch, default: 64 | 86 | |-i, --steps_to_start_learn| number of steps before the agent starts learning, default: 10000 | 87 | |-c, --test_iter| number of iterations between policy testing, default: 10000 | 88 | |-x, --record| Directory to store video recording when playing (only Linux) | 89 | |--no-visualize| if not typed, render the environment when playing | 90 | 91 | ## Playing 92 | Agent checkpoints (files ending with `.pth`) are saved and loaded from the `agent_ckpt` directory. 93 | Playing a pretrained agent for one episode: 94 | 95 | `python ls_ddpg_main.py --play -y ./saves/ddpg-agent_BipedalWalker-v2-LS-LAM-10000-100K-BOOSTING-SEED-2019-BATCH-64/best_+316.064_2410000.dat -x ./Videos/` 96 | 97 | ## Training 98 | 99 | Examples: 100 | 101 | * `python ls_ddpg_main.py --train --lsddpg -e MinitaurBulletEnv-v0 -l 1 -b 64` 102 | * `python ls_ddpg_main.py --train --lsddpg --boosting -e BipedalWalker-v2 -l 10000 -b 64` 103 | 104 | For a full description of the flags, see the API above. 
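As an additional illustration, a run that also sets the discount factor, Replay Buffer size and test interval explicitly (the values here are simply the documented defaults from the API table):

`python ls_ddpg_main.py --train --lsddpg -e BipedalWalker-v2 -l 1 -b 64 -g 0.99 -s 1000000 -c 10000`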
105 | 106 | ## TensorBoard 107 | 108 | TensorBoard logs are written dynamically during the runs, and it is possible to observe the training progress using the graphs. In order to open TensorBoard, navigate to the source directory of the project and run in the terminal/cmd: 109 | 110 | `tensorboard --logdir=./runs` 111 | 112 | * make sure you have the correct environment activated (`conda activate env-name`) and that you have `tensorboard` and `tensorboardX` installed. 113 | 114 | ## References 115 | * [PyTorch Agent Net: reinforcement learning toolkit for pytorch](https://github.com/Shmuma/ptan) by [Max Lapan](https://github.com/Shmuma) 116 | * Nir Levine, Tom Zahavy, Daniel J. Mankowitz, Aviv Tamar, Shie Mannor [Shallow Updates for Deep Reinforcement Learning](https://arxiv.org/abs/1705.07461), NIPS 2017 117 | 118 | 119 | -------------------------------------------------------------------------------- /utils/nn_agent_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base models taken from the PTAN (PyTorch Agent Net) library by Shmuma 3 | https://github.com/Shmuma/ptan 4 | """ 5 | # imports 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | import copy 10 | 11 | HID_SIZE = 128 12 | 13 | 14 | def float32_preprocessor(states): 15 | np_states = np.array(states, dtype=np.float32) 16 | return torch.tensor(np_states) 17 | 18 | 19 | class TargetNet: 20 | """ 21 | Wrapper around a model which provides a copy of it instead of the trained weights 22 | """ 23 | 24 | def __init__(self, model): 25 | self.model = model 26 | self.target_model = copy.deepcopy(model) 27 | 28 | def sync(self): 29 | self.target_model.load_state_dict(self.model.state_dict()) 30 | 31 | def alpha_sync(self, alpha): 32 | """ 33 | Blend params of target net with params from the model 34 | :param alpha: fraction of the target network's weights to keep (the model contributes 1 - alpha) 35 | """ 36 | assert isinstance(alpha, float) 37 | assert 0.0 < alpha <= 1.0 38 | state = self.model.state_dict() 39 | tgt_state = self.target_model.state_dict() 40 | for k, v in state.items(): 41 | tgt_state[k] = tgt_state[k] * alpha + (1 - alpha) * v 42 | self.target_model.load_state_dict(tgt_state) 43 | 44 | 45 | class BaseAgent: 46 | """ 47 | Abstract Agent interface 48 | """ 49 | 50 | def initial_state(self): 51 | """ 52 | Should create initial empty state for the agent. 
It will be called for the start of the episode 53 | :return: Anything agent want to remember 54 | """ 55 | return None 56 | 57 | def __call__(self, states, agent_states): 58 | """ 59 | Convert observations and states into actions to take 60 | :param states: list of environment states to process 61 | :param agent_states: list of states with the same length as observations 62 | :return: tuple of actions, states 63 | """ 64 | assert isinstance(states, list) 65 | assert isinstance(agent_states, list) 66 | assert len(agent_states) == len(states) 67 | 68 | raise NotImplementedError 69 | 70 | 71 | class DDPGActor(nn.Module): 72 | def __init__(self, obs_size, act_size): 73 | super(DDPGActor, self).__init__() 74 | 75 | self.fc1 = nn.Sequential(nn.Linear(obs_size, 400), 76 | nn.ReLU() 77 | ) 78 | # self.fc2 = nn.Sequential(nn.Linear(400, 300), 79 | # nn.ReLU() 80 | # ) 81 | self.fc2 = nn.Linear(400, 300) 82 | self.relu_2 = nn.ReLU() 83 | self.fc3 = nn.Linear(300, act_size) 84 | self.tanh_3 = nn.Tanh() 85 | # self.net = nn.Sequential( 86 | # nn.Linear(obs_size, 400), 87 | # nn.ReLU(), 88 | # nn.Linear(400, 300), 89 | # nn.ReLU(), 90 | # nn.Linear(300, act_size), 91 | # nn.Tanh() 92 | # ) 93 | 94 | def forward(self, x): 95 | return self.tanh_3(self.fc3(self.relu_2(self.fc2(self.fc1(x))))) 96 | 97 | def forward_to_last_hidden(self, x): 98 | return self.relu_2(self.fc2(self.fc1(x))) 99 | 100 | 101 | class DDPGCritic(nn.Module): 102 | def __init__(self, obs_size, act_size): 103 | super(DDPGCritic, self).__init__() 104 | 105 | self.obs_net = nn.Sequential( 106 | nn.Linear(obs_size, 400), 107 | nn.ReLU() 108 | ) 109 | 110 | self.out_fc1 = nn.Linear(400 + act_size, 300) 111 | self.relu_1 = nn.ReLU() 112 | 113 | self.out_fc2 = nn.Linear(300, 1) 114 | 115 | # self.out_net = nn.Sequential( 116 | # nn.Linear(400 + act_size, 300), 117 | # nn.ReLU(), 118 | # nn.Linear(300, 1) 119 | # ) 120 | 121 | def forward(self, x, a): 122 | obs = self.obs_net(x) 123 | return self.out_fc2(self.relu_1(self.out_fc1((torch.cat([obs, a], dim=1))))) 124 | 125 | def forward_to_last_hidden(self, x, a): 126 | obs = self.obs_net(x) 127 | return self.relu_1(self.out_fc1((torch.cat([obs, a], dim=1)))) 128 | 129 | 130 | class AgentDDPG(BaseAgent): 131 | """ 132 | Agent implementing Orstein-Uhlenbeck exploration process. 
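On every call, the internal noise state x is updated as x += ou_teta * (ou_mu - x) + ou_sigma * N(0, 1), and epsilon * x is added to the deterministic action mu(s); when use_decaying_noise is True, epsilon decays linearly with the number of agent calls (at rate 1 / ou_decay_steps) down to ou_epsilon_end.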
133 | # Implemented noise decaying for convergence 134 | """ 135 | 136 | def __init__(self, net, device="cpu", ou_enabled=True, ou_mu=0.0, ou_teta=0.15, ou_sigma=0.2, ou_epsilon=1.0, 137 | ou_decay_steps=500000, ou_epsilon_end=0.01, use_decaying_noise=True): 138 | self.net = net 139 | self.device = device 140 | self.ou_enabled = ou_enabled 141 | self.ou_mu = ou_mu 142 | self.ou_teta = ou_teta 143 | self.ou_sigma = ou_sigma 144 | self.ou_epsilon = ou_epsilon 145 | self.ou_decay_steps = ou_decay_steps 146 | self.ou_epsilon_end = ou_epsilon_end 147 | self.use_decaying_noise = use_decaying_noise 148 | self.num_agent_calls = 0 149 | 150 | def initial_state(self): 151 | return None 152 | 153 | def __call__(self, states, agent_states): 154 | states_v = float32_preprocessor(states).to(self.device) 155 | # we use the deterministic output of the actor as the expected value 156 | mu_v = self.net(states_v) 157 | actions = mu_v.data.cpu().numpy() 158 | 159 | if self.ou_enabled and self.ou_epsilon > 0: 160 | new_a_states = [] 161 | for a_state, action in zip(agent_states, actions): 162 | if a_state is None: 163 | # initialization of the OU process 164 | a_state = np.zeros(shape=action.shape, dtype=np.float32) 165 | a_state += self.ou_teta * (self.ou_mu - a_state) 166 | a_state += self.ou_sigma * np.random.normal(size=action.shape) 167 | if self.use_decaying_noise: 168 | epsilon = max(self.ou_epsilon_end, self.ou_epsilon - self.num_agent_calls / self.ou_decay_steps) 169 | else: 170 | epsilon = self.ou_epsilon 171 | action += epsilon * a_state 172 | new_a_states.append(a_state) 173 | else: 174 | new_a_states = agent_states 175 | 176 | self.num_agent_calls += 1 177 | actions = np.clip(actions, -1, 1) 178 | return actions, new_a_states 179 | -------------------------------------------------------------------------------- /train_ls_ddpg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import gym 4 | import pybullet_envs 5 | # import argparse 6 | from tensorboardX import SummaryWriter 7 | import numpy as np 8 | import utils.nn_agent_models as agent_model 9 | import utils.Experience as Experience 10 | import utils.utils as utils 11 | # import utils 12 | from utils.srl_algorithms import ls_step 13 | import torch 14 | import torch.optim as optim 15 | import torch.nn.functional as F 16 | import random 17 | from utils.utils import test_net 18 | 19 | # ENV_ID = "MinitaurBulletEnv-v0" 20 | ENV_ID = "BipedalWalker-v2" 21 | GAMMA = 0.99 22 | BATCH_SIZE = 64 23 | LEARNING_RATE_ACTOR = 0.0001 24 | LEARNING_RATE_CRITIC = 0.0001 25 | REPLAY_SIZE = 100000 26 | REPLAY_INITIAL = 10000 # 10000 27 | N_DRL = 100000 28 | N_SRL = REPLAY_SIZE 29 | REWARD_TO_SOLVE = 300 30 | 31 | 32 | TEST_ITERS = 10000 # 1000 for Minitaure 33 | 34 | 35 | if __name__ == "__main__": 36 | # parser = argparse.ArgumentParser() 37 | # parser.add_argument("--cuda", default=False, action='store_true', help='Enable CUDA') 38 | # parser.add_argument("-n", "--name", required=True, help="Name of the run") 39 | # args = parser.parse_args() 40 | training_random_seed = 2019 41 | use_constant_seed = True # to compare performance independently of the randomness 42 | use_ls_ddpg = True 43 | use_boosting = False 44 | lam = 10 # regularization parameter 45 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 46 | env = gym.make(ENV_ID) 47 | test_env = gym.make(ENV_ID) 48 | name = "agent_" + ENV_ID 49 | if use_ls_ddpg: 50 | print("using LS-DDPG") 51 | name += "-LS-LAM-" + str(lam) 
+ "-" + str(int(1.0 * N_DRL / 1000)) + "K" 52 | if use_boosting: 53 | print("using boosting") 54 | name += "-BOOSTING" 55 | if use_constant_seed: 56 | name += "-SEED-" + str(training_random_seed) 57 | np.random.seed(training_random_seed) 58 | random.seed(training_random_seed) 59 | env.seed(training_random_seed) 60 | test_env.seed(training_random_seed) 61 | torch.manual_seed(training_random_seed) 62 | if torch.cuda.is_available(): 63 | torch.cuda.manual_seed_all(training_random_seed) 64 | print("training using constant seed of ", training_random_seed) 65 | name += "-BATCH-" + str(BATCH_SIZE) 66 | save_path = os.path.join("saves", "ddpg-" + name) 67 | os.makedirs(save_path, exist_ok=True) 68 | ckpt_save_path = './agent_ckpt/' + name + ".pth" 69 | if not os.path.exists('./agent_ckpt/'): 70 | os.makedirs('./agent_ckpt') 71 | 72 | act_net = agent_model.DDPGActor(env.observation_space.shape[0], env.action_space.shape[0]).to(device) 73 | crt_net = agent_model.DDPGCritic(env.observation_space.shape[0], env.action_space.shape[0]).to(device) 74 | print(act_net) 75 | print(crt_net) 76 | tgt_act_net = agent_model.TargetNet(act_net) 77 | tgt_crt_net = agent_model.TargetNet(crt_net) 78 | 79 | writer = SummaryWriter(comment="-ddpg-" + name) 80 | agent = agent_model.AgentDDPG(act_net, device=device) 81 | exp_source = Experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA, steps_count=1) 82 | buffer = Experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE) 83 | act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE_ACTOR) 84 | crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE_CRITIC) 85 | 86 | utils.load_agent_state(act_net, crt_net, [act_opt, crt_opt], path=ckpt_save_path) 87 | 88 | frame_idx = 0 89 | drl_updates = 0 90 | best_reward = None 91 | with utils.RewardTracker(writer) as tracker: 92 | with utils.TBMeanTracker(writer, batch_size=10) as tb_tracker: 93 | while True: 94 | frame_idx += 1 95 | buffer.populate(1) 96 | rewards_steps = exp_source.pop_rewards_steps() 97 | if rewards_steps: 98 | rewards, steps = zip(*rewards_steps) 99 | tb_tracker.track("episode_steps", steps[0], frame_idx) 100 | mean_reward = tracker.reward(rewards[0], frame_idx) 101 | if mean_reward is not None and mean_reward > REWARD_TO_SOLVE: 102 | print("environment solved in % steps" % frame_idx, " (% episodes)" % len(tracker.total_rewards)) 103 | utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt], frame_idx, 104 | len(tracker.total_rewards), path=ckpt_save_path) 105 | break 106 | 107 | if len(buffer) < REPLAY_INITIAL: 108 | continue 109 | 110 | batch = buffer.sample(BATCH_SIZE) 111 | states_v, actions_v, rewards_v, dones_mask, last_states_v = utils.unpack_batch(batch, device) 112 | 113 | # train critic 114 | crt_opt.zero_grad() 115 | q_v = crt_net(states_v, actions_v) 116 | last_act_v = tgt_act_net.target_model(last_states_v) 117 | q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v) 118 | q_last_v[dones_mask] = 0.0 119 | q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * GAMMA 120 | critic_loss_v = F.mse_loss(q_v, q_ref_v.detach()) 121 | critic_loss_v.backward() 122 | crt_opt.step() 123 | tb_tracker.track("loss_critic", critic_loss_v, frame_idx) 124 | tb_tracker.track("critic_ref", q_ref_v.mean(), frame_idx) 125 | 126 | drl_updates += 1 127 | # LS-UPDATE STEP for Critic (Q) 128 | if use_ls_ddpg and (drl_updates % N_DRL == 0) and (len(buffer) >= N_SRL): 129 | # if len(buffer) > 1: 130 | print("performing ls step...") 131 | batch = buffer.sample(N_SRL) 132 | ls_step([act_net, 
crt_net], [tgt_act_net, tgt_crt_net], batch, GAMMA, len(buffer), 133 | lam=lam, m_batch_size=256, device=device, use_boosting=use_boosting) 134 | 135 | # train actor 136 | act_opt.zero_grad() 137 | cur_actions_v = act_net(states_v) 138 | actor_loss_v = -crt_net(states_v, cur_actions_v) 139 | actor_loss_v = actor_loss_v.mean() 140 | actor_loss_v.backward() 141 | act_opt.step() 142 | tb_tracker.track("loss_actor", actor_loss_v, frame_idx) 143 | 144 | tgt_act_net.alpha_sync(alpha=1 - 1e-3) 145 | tgt_crt_net.alpha_sync(alpha=1 - 1e-3) 146 | 147 | if frame_idx % TEST_ITERS == 0: 148 | ts = time.time() 149 | rewards, steps = test_net(act_net, test_env, agent_model, device=device) 150 | print("Test done in %.2f sec, reward %.3f, steps %d" % ( 151 | time.time() - ts, rewards, steps)) 152 | writer.add_scalar("test_reward", rewards, frame_idx) 153 | writer.add_scalar("test_steps", steps, frame_idx) 154 | if best_reward is None or best_reward < rewards: 155 | if best_reward is not None: 156 | print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) 157 | name = "best_%+.3f_%d.dat" % (rewards, frame_idx) 158 | fname = os.path.join(save_path, name) 159 | torch.save(act_net.state_dict(), fname) 160 | utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt], frame_idx, 161 | len(tracker.total_rewards), path=ckpt_save_path) 162 | best_reward = rewards 163 | 164 | pass 165 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions. 3 | Most taken from the PTAN (PyTorch Agent Net) library by Shmuma 4 | https://github.com/Shmuma/ptan 5 | """ 6 | 7 | # imports 8 | import numpy as np 9 | import torch 10 | import time 11 | import sys 12 | import collections 13 | import os 14 | from utils.nn_agent_models import float32_preprocessor 15 | 16 | 17 | def test_net(net, env, agent_model, count=10, device="cpu"): 18 | rewards = 0.0 19 | steps = 0 20 | for _ in range(count): 21 | obs = env.reset() 22 | while True: 23 | obs_v = agent_model.float32_preprocessor([obs]).to(device) 24 | mu_v = net(obs_v) 25 | action = mu_v.squeeze(dim=0).data.cpu().numpy() 26 | action = np.clip(action, -1, 1) 27 | obs, reward, done, _ = env.step(action) 28 | rewards += reward 29 | steps += 1 30 | if done: 31 | break 32 | return rewards / count, steps / count 33 | 34 | 35 | def save_agent_state(act_net, crt_net, optimizers, frame, games, save_replay=False, replay_buffer=None, name='', 36 | path=None): 37 | """ 38 | This function saves the current state of the NN (the weights) to a local file. 
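The checkpoint is a single dictionary written with torch.save, containing the Actor and Critic state dicts, both optimizer state dicts, the current frame count, the number of games seen and, optionally, the replay buffer.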
39 | :param act_net: the current actor NN (nn.Module) 40 | :param crt_net: the current critic NN (nn.Module) 41 | :param optimizers: the network's optimizer (torch.optim) 42 | :param frame: current frame number (int) 43 | :param games: total number of games seen (int) 44 | :param save_replay: whether or not to save the replay buffer (bool) 45 | :param replay_buffer: the replay buffer (list) 46 | :param name: specific name for the checkpoint (str) 47 | :param path: path to specific location where to save (str) 48 | """ 49 | dir_name = './agent_ckpt' 50 | if path: 51 | full_path = path 52 | else: 53 | if name: 54 | filename = "agent_ls_ddpg_" + name + ".pth" 55 | else: 56 | filename = "agent_ls_ddpg.pth" 57 | dir_name = './agent_ckpt' 58 | full_path = os.path.join(dir_name, filename) 59 | if not os.path.exists(dir_name): 60 | os.makedirs(dir_name) 61 | if save_replay and replay_buffer is not None: 62 | torch.save({ 63 | 'act_state_dict': act_net.state_dict(), 64 | 'crt_state_dict': crt_net.state_dict(), 65 | 'act_optimizer_state_dict': optimizers[0].state_dict(), 66 | 'crt_optimizer_state_dict': optimizers[1].state_dict(), 67 | 'frame_count': frame, 68 | 'games': games, 69 | 'replay_buffer': replay_buffer 70 | }, full_path) 71 | else: 72 | torch.save({ 73 | 'act_state_dict': act_net.state_dict(), 74 | 'crt_state_dict': crt_net.state_dict(), 75 | 'act_optimizer_state_dict': optimizers[0].state_dict(), 76 | 'crt_optimizer_state_dict': optimizers[1].state_dict(), 77 | 'frame_count': frame, 78 | 'games': games 79 | }, full_path) 80 | print("Saved Agent checkpoint @ ", full_path) 81 | 82 | 83 | def load_agent_state(act_net, crt_net, optimizers, path=None, copy_to_target_network=False, load_optimizer=True, 84 | target_nets=None, buffer=None, load_buffer=False): 85 | """ 86 | This function loads a state of the NN (the weights) from a local file. 
87 | :param act_net: the current actor NN (nn.Module) 88 | :param crt_net: the current critic NN (nn.Module) 89 | :param optimizers: the network's optimizers (torch.optim) 90 | :param path: full path to checkpoint file (.pth) (str) 91 | :param copy_to_target_network: whether or not to copy the weights to target network (bool) 92 | :param load_optimizer: whether or not to load the optimizer state (bool) 93 | :param load_buffer: whether or not to load the replay buffer (bool) 94 | :param buffer: the replay buffer 95 | :param target_nets: the target NNs 96 | """ 97 | if path is None: 98 | raise SystemExit("path to model must be specified") 99 | else: 100 | full_path = path 101 | exists = os.path.isfile(full_path) 102 | if exists: 103 | if not torch.cuda.is_available(): 104 | checkpoint = torch.load(full_path, map_location='cpu') 105 | else: 106 | checkpoint = torch.load(full_path) 107 | act_net.load_state_dict(checkpoint['act_state_dict']) 108 | crt_net.load_state_dict(checkpoint['crt_state_dict']) 109 | if load_optimizer: 110 | optimizers[0].load_state_dict(checkpoint['act_optimizer_state_dict']) 111 | optimizers[1].load_state_dict(checkpoint['crt_optimizer_state_dict']) 112 | # self.steps_count = checkpoint['steps_count'] 113 | # self.episodes_seen = checkpoint['episodes_seen'] 114 | # selector.epsilon = checkpoint['epsilon'] 115 | # self.num_param_update = checkpoint['num_param_updates'] 116 | print("Checkpoint loaded successfully from ", full_path) 117 | # # for manual loading a checkpoint 118 | if copy_to_target_network and target_nets is not None: 119 | target_nets[0].sync() 120 | target_nets[1].sync() 121 | if load_buffer and buffer is not None: 122 | buffer.buffer = checkpoint['replay_buffer'] 123 | else: 124 | print("No checkpoint found...") 125 | 126 | 127 | def unpack_batch(batch, device="cpu"): 128 | states, actions, rewards, dones, last_states = [], [], [], [], [] 129 | for exp in batch: 130 | states.append(exp.state) 131 | actions.append(exp.action) 132 | rewards.append(exp.reward) 133 | dones.append(exp.last_state is None) 134 | if exp.last_state is None: 135 | last_states.append(exp.state) 136 | else: 137 | last_states.append(exp.last_state) 138 | states_v = float32_preprocessor(states).to(device) 139 | actions_v = float32_preprocessor(actions).to(device) 140 | rewards_v = float32_preprocessor(rewards).to(device) 141 | last_states_v = float32_preprocessor(last_states).to(device) 142 | dones_t = torch.ByteTensor(dones).to(device) 143 | return states_v, actions_v, rewards_v, dones_t, last_states_v 144 | 145 | 146 | class TBMeanTracker: 147 | """ 148 | TensorBoard value tracker: allows to batch fixed amount of historical values and write their mean into TB 149 | 150 | Designed and tested with pytorch-tensorboard in mind 151 | """ 152 | def __init__(self, writer, batch_size): 153 | """ 154 | :param writer: writer with close() and add_scalar() methods 155 | :param batch_size: integer size of batch to track 156 | """ 157 | assert isinstance(batch_size, int) 158 | assert writer is not None 159 | self.writer = writer 160 | self.batch_size = batch_size 161 | 162 | def __enter__(self): 163 | self._batches = collections.defaultdict(list) 164 | return self 165 | 166 | def __exit__(self, exc_type, exc_val, exc_tb): 167 | self.writer.close() 168 | 169 | @staticmethod 170 | def _as_float(value): 171 | assert isinstance(value, (float, int, np.ndarray, np.generic, torch.autograd.Variable)) or torch.is_tensor(value) 172 | tensor_val = None 173 | if isinstance(value, torch.autograd.Variable): 
174 | tensor_val = value.data 175 | elif torch.is_tensor(value): 176 | tensor_val = value 177 | 178 | if tensor_val is not None: 179 | return tensor_val.float().mean() 180 | elif isinstance(value, np.ndarray): 181 | return float(np.mean(value)) 182 | else: 183 | return float(value) 184 | 185 | def track(self, param_name, value, iter_index): 186 | assert isinstance(param_name, str) 187 | assert isinstance(iter_index, int) 188 | 189 | data = self._batches[param_name] 190 | data.append(self._as_float(value)) 191 | 192 | if len(data) >= self.batch_size: 193 | self.writer.add_scalar(param_name, np.mean(data), iter_index) 194 | data.clear() 195 | 196 | 197 | class RewardTracker: 198 | def __init__(self, writer, min_ts_diff=1.0): 199 | """ 200 | Constructs RewardTracker 201 | :param writer: writer to use for writing stats 202 | :param min_ts_diff: minimal time difference to track speed 203 | """ 204 | self.writer = writer 205 | self.min_ts_diff = min_ts_diff 206 | 207 | def __enter__(self): 208 | self.ts = time.time() 209 | self.ts_frame = 0 210 | self.total_rewards = [] 211 | return self 212 | 213 | def __exit__(self, *args): 214 | self.writer.close() 215 | 216 | def reward(self, reward, frame, epsilon=None): 217 | self.total_rewards.append(reward) 218 | mean_reward = np.mean(self.total_rewards[-100:]) 219 | ts_diff = time.time() - self.ts 220 | if ts_diff > self.min_ts_diff: 221 | speed = (frame - self.ts_frame) / ts_diff 222 | self.ts_frame = frame 223 | self.ts = time.time() 224 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 225 | print("%d: done %d episodes, mean reward %.3f, speed %.2f f/s%s" % ( 226 | frame, len(self.total_rewards), mean_reward, speed, epsilon_str 227 | )) 228 | sys.stdout.flush() 229 | self.writer.add_scalar("speed", speed, frame) 230 | if epsilon is not None: 231 | self.writer.add_scalar("epsilon", epsilon, frame) 232 | self.writer.add_scalar("reward_100", mean_reward, frame) 233 | self.writer.add_scalar("reward", reward, frame) 234 | return mean_reward if len(self.total_rewards) > 30 else None 235 | 236 | -------------------------------------------------------------------------------- /utils/Experience.py: -------------------------------------------------------------------------------- 1 | """ 2 | Experience Source. 
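Provides the n-step ExperienceSource, the ExperienceSourceFirstLast wrapper (which emits only the first and last states of each n-step piece together with the discounted reward), and a simple ExperienceReplayBuffer.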
3 | Base models taken from the PTAN (PyTorch Agent Net) library by Shmuma 4 | https://github.com/Shmuma/ptan 5 | """ 6 | 7 | import gym 8 | import collections 9 | 10 | 11 | import numpy as np 12 | 13 | from collections import namedtuple, deque 14 | 15 | from utils.nn_agent_models import BaseAgent 16 | 17 | # one single experience step 18 | Experience = namedtuple('Experience', ['state', 'action', 'reward', 'done']) 19 | 20 | 21 | class ExperienceSource: 22 | """ 23 | Simple n-step experience source using single or multiple environments 24 | 25 | Every experience contains n list of Experience entries 26 | """ 27 | def __init__(self, env, agent, steps_count=2, steps_delta=1, vectorized=False): 28 | """ 29 | Create simple experience source 30 | :param env: environment or list of environments to be used 31 | :param agent: callable to convert batch of states into actions to take 32 | :param steps_count: count of steps to track for every experience chain 33 | :param steps_delta: how many steps to do between experience items 34 | :param vectorized: support of vectorized envs from OpenAI universe 35 | """ 36 | assert isinstance(env, (gym.Env, list, tuple)) 37 | assert isinstance(agent, BaseAgent) 38 | assert isinstance(steps_count, int) 39 | assert steps_count >= 1 40 | assert isinstance(vectorized, bool) 41 | if isinstance(env, (list, tuple)): 42 | self.pool = env 43 | else: 44 | self.pool = [env] 45 | self.agent = agent 46 | self.steps_count = steps_count 47 | self.steps_delta = steps_delta 48 | self.total_rewards = [] 49 | self.total_steps = [] 50 | self.vectorized = vectorized 51 | 52 | def __iter__(self): 53 | states, agent_states, histories, cur_rewards, cur_steps = [], [], [], [], [] 54 | env_lens = [] 55 | for env in self.pool: 56 | obs = env.reset() 57 | # if the environment is vectorized, all it's output is lists of results. 
58 | # Details are here: https://github.com/openai/universe/blob/master/doc/env_semantics.rst 59 | if self.vectorized: 60 | obs_len = len(obs) 61 | states.extend(obs) 62 | else: 63 | obs_len = 1 64 | states.append(obs) 65 | env_lens.append(obs_len) 66 | 67 | for _ in range(obs_len): 68 | histories.append(deque(maxlen=self.steps_count)) 69 | cur_rewards.append(0.0) 70 | cur_steps.append(0) 71 | agent_states.append(self.agent.initial_state()) 72 | 73 | iter_idx = 0 74 | while True: 75 | actions = [None] * len(states) 76 | states_input = [] 77 | states_indices = [] 78 | for idx, state in enumerate(states): 79 | if state is None: 80 | actions[idx] = self.pool[0].action_space.sample() # assume that all envs are from the same family 81 | else: 82 | states_input.append(state) 83 | states_indices.append(idx) 84 | if states_input: 85 | states_actions, new_agent_states = self.agent(states_input, agent_states) 86 | for idx, action in enumerate(states_actions): 87 | g_idx = states_indices[idx] 88 | actions[g_idx] = action 89 | agent_states[g_idx] = new_agent_states[idx] 90 | grouped_actions = _group_list(actions, env_lens) 91 | 92 | global_ofs = 0 93 | for env_idx, (env, action_n) in enumerate(zip(self.pool, grouped_actions)): 94 | if self.vectorized: 95 | next_state_n, r_n, is_done_n, _ = env.step(action_n) 96 | else: 97 | next_state, r, is_done, _ = env.step(action_n[0]) 98 | next_state_n, r_n, is_done_n = [next_state], [r], [is_done] 99 | 100 | for ofs, (action, next_state, r, is_done) in enumerate(zip(action_n, next_state_n, r_n, is_done_n)): 101 | idx = global_ofs + ofs 102 | state = states[idx] 103 | history = histories[idx] 104 | 105 | cur_rewards[idx] += r 106 | cur_steps[idx] += 1 107 | if state is not None: 108 | history.append(Experience(state=state, action=action, reward=r, done=is_done)) 109 | if len(history) == self.steps_count and iter_idx % self.steps_delta == 0: 110 | yield tuple(history) 111 | states[idx] = next_state 112 | if is_done: 113 | # generate tail of history 114 | while len(history) >= 1: 115 | yield tuple(history) 116 | history.popleft() 117 | self.total_rewards.append(cur_rewards[idx]) 118 | self.total_steps.append(cur_steps[idx]) 119 | cur_rewards[idx] = 0.0 120 | cur_steps[idx] = 0 121 | # vectorized envs are reset automatically 122 | states[idx] = env.reset() if not self.vectorized else None 123 | agent_states[idx] = self.agent.initial_state() 124 | history.clear() 125 | global_ofs += len(action_n) 126 | iter_idx += 1 127 | 128 | def pop_total_rewards(self): 129 | r = self.total_rewards 130 | if r: 131 | self.total_rewards = [] 132 | self.total_steps = [] 133 | return r 134 | 135 | def pop_rewards_steps(self): 136 | res = list(zip(self.total_rewards, self.total_steps)) 137 | if res: 138 | self.total_rewards, self.total_steps = [], [] 139 | return res 140 | 141 | 142 | def _group_list(items, lens): 143 | """ 144 | Unflat the list of items by lens 145 | :param items: list of items 146 | :param lens: list of integers 147 | :return: list of list of items grouped by lengths 148 | """ 149 | res = [] 150 | cur_ofs = 0 151 | for g_len in lens: 152 | res.append(items[cur_ofs:cur_ofs+g_len]) 153 | cur_ofs += g_len 154 | return res 155 | 156 | 157 | # those entries are emitted from ExperienceSourceFirstLast. 
Reward is discounted over the trajectory piece 158 | ExperienceFirstLast = collections.namedtuple('ExperienceFirstLast', ('state', 'action', 'reward', 'last_state')) 159 | 160 | 161 | class ExperienceSourceFirstLast(ExperienceSource): 162 | """ 163 | This is a wrapper around ExperienceSource to prevent storing full trajectory in replay buffer when we need 164 | only first and last states. For every trajectory piece it calculates discounted reward and emits only first 165 | and last states and action taken in the first state. 166 | 167 | If we have partial trajectory at the end of episode, last_state will be None 168 | """ 169 | def __init__(self, env, agent, gamma, steps_count=1, steps_delta=1, vectorized=False): 170 | assert isinstance(gamma, float) 171 | super(ExperienceSourceFirstLast, self).__init__(env, agent, steps_count+1, steps_delta, vectorized=vectorized) 172 | self.gamma = gamma 173 | self.steps = steps_count 174 | 175 | def __iter__(self): 176 | for exp in super(ExperienceSourceFirstLast, self).__iter__(): 177 | if exp[-1].done and len(exp) <= self.steps: 178 | last_state = None 179 | elems = exp 180 | else: 181 | last_state = exp[-1].state 182 | elems = exp[:-1] 183 | total_reward = 0.0 184 | for e in reversed(elems): 185 | total_reward *= self.gamma 186 | total_reward += e.reward 187 | yield ExperienceFirstLast(state=exp[0].state, action=exp[0].action, 188 | reward=total_reward, last_state=last_state) 189 | 190 | 191 | class ExperienceReplayBuffer: 192 | def __init__(self, experience_source, buffer_size): 193 | assert isinstance(experience_source, (ExperienceSource, type(None))) 194 | assert isinstance(buffer_size, int) 195 | self.experience_source_iter = None if experience_source is None else iter(experience_source) 196 | self.buffer = [] 197 | self.capacity = buffer_size 198 | self.pos = 0 199 | 200 | def __len__(self): 201 | return len(self.buffer) 202 | 203 | def __iter__(self): 204 | return iter(self.buffer) 205 | 206 | def sample(self, batch_size): 207 | """ 208 | Get one random batch from experience replay 209 | TODO: implement sampling order policy 210 | :param batch_size: 211 | :return: 212 | """ 213 | if len(self.buffer) <= batch_size: 214 | return self.buffer 215 | # Warning: replace=False makes random.choice O(n) 216 | keys = np.random.choice(len(self.buffer), batch_size, replace=True) 217 | return [self.buffer[key] for key in keys] 218 | 219 | def _add(self, sample): 220 | if len(self.buffer) < self.capacity: 221 | self.buffer.append(sample) 222 | else: 223 | self.buffer[self.pos] = sample 224 | self.pos = (self.pos + 1) % self.capacity 225 | 226 | def populate(self, samples): 227 | """ 228 | Populates samples into the buffer 229 | :param samples: how many samples to populate 230 | """ 231 | for _ in range(samples): 232 | entry = next(self.experience_source_iter) 233 | self._add(entry) 234 | -------------------------------------------------------------------------------- /utils/srl_algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file implements the SRL algorithms. 3 | Author: Tal Daniel 4 | """ 5 | 6 | # imports 7 | import torch 8 | import utils.utils as utils 9 | import copy 10 | 11 | 12 | def calc_fqi_matrices(nets, tgt_nets, batch, gamma, n_srl, m_batch_size=512, device='cpu', 13 | use_boosting=False, train_actor=False): 14 | """ 15 | This function calculates A and b tensors for the FQI solution. 
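For the Critic, A = (1 / n_srl) * sum_i phi(s_i, a_i) * phi(s_i, a_i)^T and b = (1 / n_srl) * sum_i phi(s_i, a_i) * y_i, where phi is the last-hidden-layer feature vector and y_i = r_i + gamma * Q_target(s'_i, mu_target(s'_i)) (replaced by the truncated Bellman error when use_boosting is True); the bias terms are accumulated the same way with a constant feature of 1.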
16 | :param batch: batch of samples to extract features from (list) 17 | :param nets: networks to extract features from (nn.Module) 18 | :param tgt_nets: target networks from which Q values of next states are calculated (nn.Module) 19 | :param gamma: discount factor (float) 20 | :param n_srl: number of samples to include in the FQI solution 21 | :param m_batch_size: number of samples to calculate simultaneously (int) 22 | :param device: on which device to perform the calculation (cpu/gpu) 23 | :param use_boosting: whether or not to use Boosted FQI 24 | :param: train_actor: whether or not to train actor net (bool) 25 | :return: A, A_bias, b, b_bias parameters for calculating the LS (np.arrays) 26 | """ 27 | num_batches = n_srl // m_batch_size 28 | act_net, crt_net = nets 29 | tgt_act_net, tgt_crt_net = tgt_nets 30 | dim_act = act_net.fc2.out_features 31 | dim_crt = crt_net.out_fc1.out_features 32 | num_actions = act_net.fc3.out_features 33 | 34 | if train_actor: 35 | A_act = torch.zeros([dim_act * num_actions, dim_act * num_actions], dtype=torch.float32).to(device) 36 | A_act_bias = torch.zeros([1 * num_actions, 1 * num_actions], dtype=torch.float32).to(device) 37 | b_act = torch.zeros([dim_act * num_actions, 1], dtype=torch.float32).to(device) 38 | b_act_bias = torch.zeros([1 * num_actions, 1], dtype=torch.float32).to(device) 39 | 40 | A_crt = torch.zeros([dim_crt * 1, dim_crt * 1], dtype=torch.float32).to(device) 41 | A_crt_bias = torch.zeros([1 * 1, 1 * 1], dtype=torch.float32).to(device) 42 | b_crt = torch.zeros([dim_crt * 1, 1], dtype=torch.float32).to(device) 43 | b_crt_bias = torch.zeros([1 * 1, 1], dtype=torch.float32).to(device) 44 | 45 | for i in range(num_batches): 46 | idx = i * m_batch_size 47 | if i == num_batches - 1: 48 | states_v, actions_v, rewards_v, dones_mask, last_states_v = utils.unpack_batch(batch[idx:], device) 49 | else: 50 | states_v, actions_v, rewards_v, dones_mask, last_states_v = utils.unpack_batch( 51 | batch[idx: idx + m_batch_size], device) 52 | if train_actor: 53 | states_features_act = act_net.forward_to_last_hidden(states_v) 54 | states_features_crt = crt_net.forward_to_last_hidden(states_v, actions_v) 55 | 56 | # augmentation 57 | states_features_crt_bias = torch.ones([states_features_crt.shape[0], 1 * 1], 58 | dtype=torch.float32).to(device) 59 | if train_actor: 60 | states_features_act_aug = states_features_act.detach().repeat( 61 | (1, num_actions)).to( 62 | device) 63 | states_features_act_bias_aug = torch.ones([states_features_act.shape[0], 1 * num_actions], 64 | dtype=torch.float32).to(device) 65 | 66 | states_features_act_mat = torch.mm(torch.t(states_features_act_aug), states_features_act_aug) 67 | states_features_act_bias_mat = torch.mm(torch.t(states_features_act_bias_aug), states_features_act_bias_aug) 68 | 69 | states_features_crt_mat = torch.mm(torch.t(states_features_crt.detach()), states_features_crt.detach()) 70 | states_features_crt_bias_mat = torch.mm(torch.t(states_features_crt_bias), states_features_crt_bias) 71 | 72 | q_v = crt_net(states_v, actions_v) 73 | last_act_v = tgt_act_net.target_model(last_states_v) 74 | q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v) 75 | q_last_v[dones_mask] = 0.0 76 | q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * gamma # y_i 77 | 78 | if use_boosting: 79 | # calculate truncated bellman error 80 | bellman_error = q_ref_v.detach() - q_v.detach() 81 | truncated_bellman_error = bellman_error.clamp(-1, 1) 82 | 83 | if train_actor: 84 | b_act += 
torch.mm(torch.t(states_features_act_aug.detach()), 85 | truncated_bellman_error.detach().view(-1, 1)) 86 | b_act_bias += torch.mm(torch.t(states_features_act_bias_aug), 87 | truncated_bellman_error.detach().view(-1, 1)) 88 | 89 | b_crt += torch.mm(torch.t(states_features_crt.detach()), 90 | truncated_bellman_error.detach().view(-1, 1)) 91 | b_crt_bias += torch.mm(torch.t(states_features_crt_bias), 92 | truncated_bellman_error.detach().view(-1, 1)) 93 | else: 94 | if train_actor: 95 | b_act += torch.mm(torch.t(states_features_act_aug.detach()), 96 | q_ref_v.detach().view(-1, 1)) 97 | b_act_bias += torch.mm(torch.t(states_features_act_bias_aug), 98 | q_ref_v.detach().view(-1, 1)) 99 | 100 | b_crt += torch.mm(torch.t(states_features_crt.detach()), 101 | q_ref_v.detach().view(-1, 1)) 102 | b_crt_bias += torch.mm(torch.t(states_features_crt_bias), 103 | q_ref_v.detach().view(-1, 1)) 104 | if train_actor: 105 | A_act += states_features_act_mat.detach() 106 | A_act_bias += states_features_act_bias_mat 107 | A_crt += states_features_crt_mat.detach() 108 | A_crt_bias += states_features_crt_bias_mat 109 | if train_actor: 110 | A_act = (1.0 / n_srl) * A_act 111 | A_act_bias = (1.0 / n_srl) * A_act_bias 112 | b_act = (1.0 / n_srl) * b_act 113 | b_act_bias = (1.0 / n_srl) * b_act_bias 114 | 115 | A_crt = (1.0 / n_srl) * A_crt 116 | A_crt_bias = (1.0 / n_srl) * A_crt_bias 117 | b_crt = (1.0 / n_srl) * b_crt 118 | b_crt_bias = (1.0 / n_srl) * b_crt_bias 119 | 120 | if train_actor: 121 | return A_act, A_act_bias, b_act, b_act_bias, A_crt, A_crt_bias, b_crt, b_crt_bias 122 | else: 123 | return A_crt, A_crt_bias, b_crt, b_crt_bias 124 | 125 | 126 | def calc_fqi_w_srl(a, a_bias, b, b_bias, w, w_b, lam=1.0, device='cpu'): 127 | """ 128 | This function calculates the closed-form solution of the DQI algorithm. 129 | :param a: A matrix built from features (np.array) 130 | :param a_bias: same, but for bias 131 | :param b: b vector built from features and rewards (np.array) 132 | :param b_bias: same, but for bias 133 | :param w: weights of the last hidden layer in the DQN (np.array) 134 | :param w_b: bias weights 135 | :param lam: regularization parameter for the Least-Square (float) 136 | :param device: on which device to perform the calculation (cpu/gpu) 137 | :return: w_srl: retrained weights using FQI closed-form solution (np.array) 138 | """ 139 | num_actions = w.shape[0] 140 | dim = w.shape[1] 141 | w = w.view(-1, 1) 142 | w_b = w_b.view(-1, 1) 143 | w_srl = torch.mm(torch.inverse(a + lam * torch.eye(num_actions * dim).to(device)), b + lam * w.detach()) 144 | w_b_srl = torch.mm(torch.inverse(a_bias + lam * torch.eye(num_actions * 1).to(device)), b_bias + lam * w_b.detach()) 145 | return w_srl.view(num_actions, dim), w_b_srl.squeeze() 146 | 147 | 148 | def ls_step(nets, tgt_nets, batch, gamma, n_srl, lam=1.0, m_batch_size=256, device='cpu', use_boosting=False, 149 | sync_tgt=False): 150 | """ 151 | This function performs the least-squares update on the last hidden layer weights. 
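The new last-layer weights are the regularized FQI solution w_srl = (A + lam * I)^-1 * (b + lam * w) computed by calc_fqi_w_srl; they are loaded into the Critic's last layer (crt_net.out_fc2) and, when train_actor is enabled, into the Actor's last layer (act_net.fc3).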
152 | :param batch: batch of samples to extract features from (list) 153 | :param nets: networks to extract features from (nn.Module) 154 | :param tgt_nets: target networks from which Q values of next states are calculated (nn.Module) 155 | :param gamma: discount factor (float) 156 | :param n_srl: number of samples to include in the FQI solution 157 | :param lam: regularization parameter for the Least-Square (float) 158 | :param m_batch_size: number of samples to calculate simultaneously (int) 159 | :param device: on which device to perform the calculation (cpu/gpu) 160 | :param use_boosting: whether or not to use Boosted FQI 161 | :param sync_tgt: whether or not to sync target networks (bool) 162 | :return: 163 | """ 164 | train_actor = False 165 | act_net, crt_net = nets 166 | tgt_act_net, tgt_crt_net = tgt_nets 167 | a_b_s = calc_fqi_matrices(nets, tgt_nets, batch, gamma, 168 | n_srl, m_batch_size=m_batch_size, device=device, use_boosting=use_boosting, 169 | train_actor=train_actor 170 | ) 171 | if train_actor: 172 | a_act, a_act_bias, b_act, b_act_bias, a_crt, a_crt_bias, b_crt, b_crt_bias = a_b_s 173 | else: 174 | a_crt, a_crt_bias, b_crt, b_crt_bias = a_b_s 175 | 176 | if train_actor: 177 | w_act_last_dict = copy.deepcopy(act_net.fc3.state_dict()) 178 | w_act_last_dict_before = copy.deepcopy(act_net.fc3.state_dict()) 179 | w_act_srl, w_b_act_srl = calc_fqi_w_srl(a_act.detach(), a_act_bias.detach(), b_act.detach(), 180 | b_act_bias.detach(), 181 | w_act_last_dict['weight'], w_act_last_dict['bias'], lam=lam, 182 | device=device) 183 | 184 | w_crt_last_dict = copy.deepcopy(crt_net.out_fc2.state_dict()) 185 | w_crt_last_dict_before = copy.deepcopy(crt_net.out_fc2.state_dict()) 186 | w_crt_srl, w_b_crt_srl = calc_fqi_w_srl(a_crt.detach(), a_crt_bias.detach(), b_crt.detach(), 187 | b_crt_bias.detach(), 188 | w_crt_last_dict['weight'], w_crt_last_dict['bias'], lam=lam, 189 | device=device) 190 | 191 | if train_actor: 192 | w_act_last_dict['weight'] = w_act_srl.detach() 193 | w_act_last_dict['bias'] = w_b_act_srl.detach() 194 | act_net.fc3.load_state_dict(w_act_last_dict) 195 | 196 | weight_diff_act = torch.sum((w_act_last_dict['weight'] - w_act_last_dict_before['weight']) ** 2) 197 | bias_diff_act = torch.sum((w_act_last_dict['bias'] - w_act_last_dict_before['bias']) ** 2) 198 | total_weight_diff_act = torch.sqrt(weight_diff_act + bias_diff_act) 199 | 200 | w_crt_last_dict['weight'] = w_crt_srl.detach() 201 | w_crt_last_dict['bias'] = w_b_crt_srl.detach().unsqueeze(-1) 202 | crt_net.out_fc2.load_state_dict(w_crt_last_dict) 203 | # weight diff 204 | weight_diff_crt = torch.sum((w_crt_last_dict['weight'] - w_crt_last_dict_before['weight']) ** 2) 205 | bias_diff_crt = torch.sum((w_crt_last_dict['bias'] - w_crt_last_dict_before['bias']) ** 2) 206 | total_weight_diff_crt = torch.sqrt(weight_diff_crt + bias_diff_crt) 207 | 208 | if sync_tgt: 209 | if train_actor: 210 | tgt_act_net.alpha_sync(alpha=1 - 1e-3) 211 | tgt_crt_net.alpha_sync(alpha=1 - 1e-3) 212 | 213 | if train_actor: 214 | print("total weight difference of ls-update:: actor: %.3f" % total_weight_diff_act.item(), 215 | " critic: %.3f" % total_weight_diff_crt.item()) 216 | else: 217 | print("total weight difference of ls-update:: critic: %.3f" % total_weight_diff_crt.item()) 218 | print("least-squares step done.") 219 | -------------------------------------------------------------------------------- /ls_ddpg_main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main application for 
Least-Squares Deep Deterministic Policy Gradients 3 | """ 4 | 5 | import os 6 | import time 7 | import gym 8 | import pybullet_envs 9 | from tensorboardX import SummaryWriter 10 | import numpy as np 11 | import utils.nn_agent_models as agent_model 12 | import utils.Experience as Experience 13 | import utils.utils as utils 14 | from utils.srl_algorithms import ls_step 15 | import torch 16 | import torch.optim as optim 17 | import torch.nn.functional as F 18 | import random 19 | from utils.utils import test_net 20 | import argparse 21 | 22 | REWARD_TO_SOLVE = 300 # mean reward the environment is considered SOLVED 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser(description="train and play an LS-DQN agent") 26 | # modes 27 | parser.add_argument("-t", "--train", help="train or continue training an agent", 28 | action="store_true") 29 | parser.add_argument("-k", "--lsddpg", help="use LS-DDPG", 30 | action="store_true") 31 | parser.add_argument("-j", "--boosting", help="use boosting", 32 | action="store_true") 33 | parser.add_argument("-p", "--play", help="play the environment using an a pretrained agent", 34 | action="store_true") 35 | parser.add_argument("-y", "--path", type=str, help="path to agent checkpoint, for playing") 36 | # arguments 37 | # for training and playing 38 | parser.add_argument("-n", "--name", type=str, 39 | help="model name, for saving and loading," 40 | " if not set, training will continue from a pretrained checkpoint") 41 | parser.add_argument("-e", "--env", type=str, 42 | help="environment to play: MinitaurBulletEnv-v0, BipedalWalker-v2", default="BipedalWalker-v2") 43 | # for training 44 | parser.add_argument("-d", "--decay_rate", type=int, 45 | help="number of episodes for epsilon decaying, default: 500000") 46 | parser.add_argument("-o", "--optimizer", type=str, 47 | help="optimizing algorithm ('RMSprop', 'Adam'), deafult: 'Adam'") 48 | parser.add_argument("--lr_actor", type=float, 49 | help="learning rate for the Actor optimizer, default: 0.0001") 50 | parser.add_argument("--lr_critic", type=float, 51 | help="learning rate for the Critic optimizer, default: 0.0001") 52 | parser.add_argument("-l", "--lam", type=float, 53 | help="regularization parameter value, default: 1, 10000 (boosting)") 54 | parser.add_argument("-g", "--gamma", type=float, 55 | help="gamma parameter for the Q-Learning, default: 0.99") 56 | parser.add_argument("-s", "--buffer_size", type=int, 57 | help="Replay Buffer size, default: 1000000") 58 | parser.add_argument("-a", "--n_drl", type=int, 59 | help="number of drl updates before an srl update, default: 100000") 60 | parser.add_argument("-b", "--batch_size", type=int, 61 | help="number of samples in each batch, default: 64") 62 | parser.add_argument("-i", "--steps_to_start_learn", type=int, 63 | help="number of steps before the agents starts learning, default: 10000") 64 | parser.add_argument("-c", "--test_iter", type=int, 65 | help="number of iterations between policy testing, default: 10000") 66 | # for playing 67 | parser.add_argument("-x", "--record", help="Directory to store video recording") 68 | parser.add_argument("--no-visualize", default=True, action='store_false', dest='visualize', 69 | help="Disable visualization of the game play") 70 | 71 | args = parser.parse_args() 72 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 73 | if args.lsddpg: 74 | use_ls_ddpg = True 75 | else: 76 | use_ls_ddpg = False 77 | if args.boosting: 78 | use_boosting = True 79 | lam = 1000 80 | else: 81 | 
use_boosting = False 82 | lam = 1 83 | 84 | # Training 85 | if args.train: 86 | if args.name: 87 | model_name = args.name 88 | else: 89 | model_name = '' 90 | if args.decay_rate: 91 | decay_rate = args.decay_rate 92 | else: 93 | decay_rate = None 94 | if args.lr_actor: 95 | lr_actor = args.lr_actor 96 | else: 97 | lr_actor = 0.0001 98 | if args.lr_critic: 99 | lr_critic = args.lr_critic 100 | else: 101 | lr_critic = 0.0001 102 | if args.lam: 103 | lam = args.lam 104 | if args.gamma: 105 | gamma = args.gamma 106 | else: 107 | gamma = 0.99 108 | if args.buffer_size: 109 | replay_size = args.buffer_size 110 | else: 111 | replay_size = 100000 112 | if args.n_drl: 113 | n_drl = args.n_drl 114 | else: 115 | n_drl = 100000 # steps of DRL between SRL 116 | if args.batch_size: 117 | batch_size = args.batch_size 118 | else: 119 | batch_size = 64 120 | if args.steps_to_start_learn: 121 | steps_to_start_learn = args.steps_to_start_learn 122 | else: 123 | steps_to_start_learn = 10000 124 | if args.test_iter: 125 | test_iter = args.test_iter 126 | else: 127 | test_iter = 10000 128 | 129 | # training_random_seed = 2019 130 | save_freq = 50000 131 | n_srl = replay_size # size of batch in SRL step 132 | # use_constant_seed = False # to compare performance independently of the randomness 133 | 134 | model_saving_path = './agent_ckpt/agent_' + model_name + ".pth" 135 | # if use_constant_seed: 136 | # model_name += "-SEED-" + str(training_random_seed) 137 | # np.random.seed(training_random_seed) 138 | # random.seed(training_random_seed) 139 | # env.seed(training_random_seed) 140 | # torch.manual_seed(training_random_seed) 141 | # if torch.cuda.is_available(): 142 | # torch.cuda.manual_seed_all(training_random_seed) 143 | # print("training using constant seed of ", training_random_seed) 144 | env = gym.make(args.env) 145 | test_env = gym.make(args.env) 146 | name = model_name + "_agent_" + args.env 147 | if use_ls_ddpg: 148 | print("using LS-DDPG") 149 | name += "-LS-LAM-" + str(lam) + "-" + str(int(1.0 * n_drl / 1000)) + "K" 150 | if use_boosting: 151 | print("using boosting") 152 | name += "-BOOSTING" 153 | # if use_constant_seed: 154 | # name += "-SEED-" + str(training_random_seed) 155 | # np.random.seed(training_random_seed) 156 | # random.seed(training_random_seed) 157 | # env.seed(training_random_seed) 158 | # test_env.seed(training_random_seed) 159 | # torch.manual_seed(training_random_seed) 160 | # if torch.cuda.is_available(): 161 | # torch.cuda.manual_seed_all(training_random_seed) 162 | # print("training using constant seed of ", training_random_seed) 163 | name += "-BATCH-" + str(batch_size) 164 | save_path = os.path.join("saves", "ddpg-" + name) 165 | os.makedirs(save_path, exist_ok=True) 166 | ckpt_save_path = './agent_ckpt/' + name + ".pth" 167 | if not os.path.exists('./agent_ckpt/'): 168 | os.makedirs('./agent_ckpt') 169 | 170 | act_net = agent_model.DDPGActor(env.observation_space.shape[0], env.action_space.shape[0]).to(device) 171 | crt_net = agent_model.DDPGCritic(env.observation_space.shape[0], env.action_space.shape[0]).to(device) 172 | print(act_net) 173 | print(crt_net) 174 | tgt_act_net = agent_model.TargetNet(act_net) 175 | tgt_crt_net = agent_model.TargetNet(crt_net) 176 | 177 | writer = SummaryWriter(comment="-ddpg-" + name) 178 | if decay_rate is not None: 179 | agent = agent_model.AgentDDPG(act_net, device=device, ou_decay_steps=decay_rate) 180 | else: 181 | agent = agent_model.AgentDDPG(act_net, device=device) 182 | exp_source = Experience.ExperienceSourceFirstLast(env, agent, 
gamma=gamma, steps_count=1) 183 | buffer = Experience.ExperienceReplayBuffer(exp_source, buffer_size=replay_size) 184 | if args.optimizer and args.optimizer == "RMSprop": 185 | act_opt = optim.RMSprop(act_net.parameters(), lr=lr_actor) 186 | crt_opt = optim.RMSprop(crt_net.parameters(), lr=lr_critic) 187 | else: 188 | act_opt = optim.Adam(act_net.parameters(), lr=lr_actor) 189 | crt_opt = optim.Adam(crt_net.parameters(), lr=lr_critic) 190 | 191 | utils.load_agent_state(act_net, crt_net, [act_opt, crt_opt], path=ckpt_save_path) 192 | 193 | frame_idx = 0 194 | drl_updates = 0 195 | best_reward = None 196 | with utils.RewardTracker(writer) as tracker: 197 | with utils.TBMeanTracker(writer, batch_size=10) as tb_tracker: 198 | while True: 199 | frame_idx += 1 200 | buffer.populate(1) 201 | rewards_steps = exp_source.pop_rewards_steps() 202 | if rewards_steps: 203 | rewards, steps = zip(*rewards_steps) 204 | tb_tracker.track("episode_steps", steps[0], frame_idx) 205 | mean_reward = tracker.reward(rewards[0], frame_idx) 206 | if mean_reward is not None and mean_reward > REWARD_TO_SOLVE: 207 | print("environment solved in % steps" % frame_idx, 208 | " (% episodes)" % len(tracker.total_rewards)) 209 | utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt], frame_idx, 210 | len(tracker.total_rewards), path=ckpt_save_path) 211 | break 212 | 213 | if len(buffer) < steps_to_start_learn: 214 | continue 215 | 216 | batch = buffer.sample(batch_size) 217 | states_v, actions_v, rewards_v, dones_mask, last_states_v = utils.unpack_batch(batch, device) 218 | 219 | # train critic 220 | crt_opt.zero_grad() 221 | q_v = crt_net(states_v, actions_v) 222 | last_act_v = tgt_act_net.target_model(last_states_v) 223 | q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v) 224 | q_last_v[dones_mask] = 0.0 225 | q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * gamma 226 | critic_loss_v = F.mse_loss(q_v, q_ref_v.detach()) 227 | critic_loss_v.backward() 228 | crt_opt.step() 229 | tb_tracker.track("loss_critic", critic_loss_v, frame_idx) 230 | tb_tracker.track("critic_ref", q_ref_v.mean(), frame_idx) 231 | 232 | drl_updates += 1 233 | # LS-UPDATE STEP for Critic (Q) 234 | if use_ls_ddpg and (drl_updates % n_drl == 0) and (len(buffer) >= n_srl): 235 | # if len(buffer) > 1: 236 | print("performing ls step...") 237 | batch = buffer.sample(n_srl) 238 | ls_step([act_net, crt_net], [tgt_act_net, tgt_crt_net], batch, gamma, len(buffer), 239 | lam=lam, m_batch_size=256, device=device, use_boosting=use_boosting) 240 | 241 | # train actor 242 | act_opt.zero_grad() 243 | cur_actions_v = act_net(states_v) 244 | actor_loss_v = -crt_net(states_v, cur_actions_v) 245 | actor_loss_v = actor_loss_v.mean() 246 | actor_loss_v.backward() 247 | act_opt.step() 248 | tb_tracker.track("loss_actor", actor_loss_v, frame_idx) 249 | 250 | tgt_act_net.alpha_sync(alpha=1 - 1e-3) 251 | tgt_crt_net.alpha_sync(alpha=1 - 1e-3) 252 | 253 | if frame_idx % test_iter == 0: 254 | ts = time.time() 255 | rewards, steps = test_net(act_net, test_env, agent_model, device=device) 256 | print("Test done in %.2f sec, reward %.3f, steps %d" % ( 257 | time.time() - ts, rewards, steps)) 258 | writer.add_scalar("test_reward", rewards, frame_idx) 259 | writer.add_scalar("test_steps", steps, frame_idx) 260 | if best_reward is None or best_reward < rewards: 261 | if best_reward is not None: 262 | print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) 263 | name = "best_%+.3f_%d.dat" % (rewards, frame_idx) 264 | fname = os.path.join(save_path, name) 
265 | torch.save(act_net.state_dict(), fname) 266 | utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt], frame_idx, 267 | len(tracker.total_rewards), path=ckpt_save_path) 268 | best_reward = rewards 269 | 270 | pass 271 | elif args.play: 272 | # play 273 | if args.path: 274 | path_to_model_ckpt = args.path 275 | else: 276 | raise SystemExit("must include path to agent checkpoint") 277 | render = True 278 | spec = gym.envs.registry.spec(args.env) 279 | if spec._kwargs.get('render') and render: 280 | spec._kwargs['render'] = True 281 | env = gym.make(args.env) 282 | use_constant_seed = True 283 | seed = 2019 284 | if use_constant_seed: 285 | np.random.seed(seed) 286 | random.seed(seed) 287 | env.seed(seed) 288 | torch.manual_seed(seed) 289 | if torch.cuda.is_available(): 290 | torch.cuda.manual_seed_all(seed) 291 | print("seed set to ", seed) 292 | if args.record: 293 | # pass 294 | env = gym.wrappers.Monitor(env, args.record) 295 | 296 | net = agent_model.DDPGActor(env.observation_space.shape[0], env.action_space.shape[0]) 297 | net.load_state_dict(torch.load(path_to_model_ckpt)) 298 | 299 | obs = env.reset() 300 | total_reward = 0.0 301 | total_steps = 0 302 | while True: 303 | obs_v = torch.FloatTensor([obs]) 304 | mu_v = net(obs_v) 305 | action = mu_v.squeeze(dim=0).data.numpy() 306 | action = np.clip(action, -1, 1) 307 | obs, reward, done, _ = env.step(action) 308 | total_reward += reward 309 | total_steps += 1 310 | if render: 311 | env.render() 312 | if done: 313 | env.close() 314 | break 315 | print("In %d steps we got %.3f reward" % (total_steps, total_reward)) 316 | else: 317 | raise SystemExit("must choose between train or play") 318 | 319 | --------------------------------------------------------------------------------