├── images ├── bipedal.gif ├── minitaur.gif └── ddpg_graph.png ├── writeup └── Deep_RL_Shallow_Updates_for_Deep_Reinforcement_Learning.pdf ├── play_ls_ddpg.py ├── README.md ├── utils ├── nn_agent_models.py ├── utils.py ├── Experience.py └── srl_algorithms.py ├── train_ls_ddpg.py └── ls_ddpg_main.py /images/bipedal.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/pytorch-ls-ddpg/master/images/bipedal.gif -------------------------------------------------------------------------------- /images/minitaur.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/pytorch-ls-ddpg/master/images/minitaur.gif -------------------------------------------------------------------------------- /images/ddpg_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/pytorch-ls-ddpg/master/images/ddpg_graph.png -------------------------------------------------------------------------------- /writeup/Deep_RL_Shallow_Updates_for_Deep_Reinforcement_Learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taldatech/pytorch-ls-ddpg/master/writeup/Deep_RL_Shallow_Updates_for_Deep_Reinforcement_Learning.pdf -------------------------------------------------------------------------------- /play_ls_ddpg.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import pybullet_envs 4 | 5 | import utils.nn_agent_models as agent_model 6 | 7 | import numpy as np 8 | import torch 9 | 10 | # ENV_ID = "MinitaurBulletEnv-v0" 11 | ENV_ID = "BipedalWalker-v2" 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 17 | parser.add_argument("-e", "--env", default=ENV_ID, help="Environment name to use, default=" + ENV_ID) 18 | parser.add_argument("-r", "--record", help="If specified, sets the recording dir, default=Disabled") 19 | args = parser.parse_args() 20 | render = True 21 | spec = gym.envs.registry.spec(args.env) 22 | if spec._kwargs.get('render') and render: 23 | spec._kwargs['render'] = True 24 | env = gym.make(args.env) 25 | if args.record: 26 | env = gym.wrappers.Monitor(env, args.record) 27 | 28 | net = agent_model.DDPGActor(env.observation_space.shape[0], env.action_space.shape[0]) 29 | net.load_state_dict(torch.load(args.model)) 30 | 31 | obs = env.reset() 32 | total_reward = 0.0 33 | total_steps = 0 34 | while True: 35 | obs_v = torch.FloatTensor([obs]) 36 | mu_v = net(obs_v) 37 | action = mu_v.squeeze(dim=0).data.numpy() 38 | action = np.clip(action, -1, 1) 39 | obs, reward, done, _ = env.step(action) 40 | total_reward += reward 41 | total_steps += 1 42 | if render: 43 | env.render() 44 | if done: 45 | env.close() 46 | break 47 | print("In %d steps we got %.3f reward" % (total_steps, total_reward)) 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pytorch-ls-ddpg 2 | PyTorch Implementation of Least-Squares Deep Deterministic Policy Gradients 3 | 4 | Based on the paper: 5 | 6 | Nir Levine, Tom Zahavy, Daniel J. 
Mankowitz, Aviv Tamar, Shie Mannor [Shallow Updates for Deep Reinforcement Learning](https://arxiv.org/abs/1705.07461), NIPS 2017 7 | 8 | Video: 9 | 10 | [YouTube](https://youtu.be/i8Cnas7QrMc) - https://youtu.be/i8Cnas7QrMc 11 | 12 | 13 | ![minitaur](https://github.com/taldatech/pytorch-ls-ddpg/blob/master/images/minitaur.gif) 14 | ![bipedal](https://github.com/taldatech/pytorch-ls-ddpg/blob/master/images/bipedal.gif) 15 | 16 | ![ddpg](https://github.com/taldatech/pytorch-ls-ddpg/blob/master/images/ddpg_graph.png) 17 | 18 | [LS-DQN](https://github.com/taldatech/pytorch-ls-dqn) - https://github.com/taldatech/pytorch-ls-dqn 19 | 20 | - [pytorch-ls-ddpg](#pytorch-ls-ddpg) 21 | * [Background](#background) 22 | * [Prerequisites](#prerequisites) 23 | * [Files in the repository](#files-in-the-repository) 24 | * [API (`ls_ddpg_main.py --help`)](#api---ls-ddpg-mainpy---help--) 25 | * [Playing](#playing) 26 | * [Training](#training) 27 | * [TensorBoard](#tensorboard) 28 | * [References](#references) 29 | 30 | ## Background 31 | The idea of this algorithm is to combine Deep RL (DRL) with Shallow RL (SRL): in this case, we use Deep Deterministic Policy Gradient (DDPG) as the DRL algorithm and 32 | Fitted Q-Iteration (FQI), or its Boosted version (B-FQI), as the SRL algorithm (both can be approximated using least-squares; the full derivation is in the original paper). 33 | Every N_DRL DDPG Critic backprop steps, we apply an LS-UPDATE to the very last layer of the Critic NN, using the complete Replay Buffer, a feature augmentation technique and 34 | Bayesian regularization (which prevents overfitting and makes the LS matrix invertible) to solve the FQI equations. 35 | The assumptions are that the features extracted from the last layer form a rich representation, and that the large batch size used by the SRL algorithm improves stability and performance. 36 | For full derivations and theory, please refer to the original paper. 
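Below is a minimal, illustrative sketch of the LS-UPDATE applied to the Critic's last layer, mirroring `calc_fqi_w_srl` in `utils/srl_algorithms.py`; the function and variable names (`ls_update_critic_last_layer`, `phi`, `y`) are chosen for illustration only, and the actual implementation also solves for the bias via feature augmentation:

```python
import torch

def ls_update_critic_last_layer(phi, y, w, lam=1.0):
    """
    Illustrative LS-UPDATE sketch (see utils/srl_algorithms.py for the full version).
    phi: [N, d] last-hidden-layer features of the Critic for the (s, a) pairs in the Replay Buffer
    y:   [N, 1] FQI regression targets r + gamma * Q_target(s', mu_target(s'))
    w:   [1, d] current last-layer weights, used as the Bayesian (regularization) prior
    """
    n, d = phi.shape
    A = phi.t() @ phi / n  # feature covariance matrix
    b = phi.t() @ y / n    # feature-target correlation vector
    # regularized closed-form solution: w_new = (A + lam * I)^-1 (b + lam * w)
    w_new = torch.inverse(A + lam * torch.eye(d)) @ (b + lam * w.view(-1, 1))
    return w_new.view(1, d)
```

With `--boosting` (B-FQI), the same solve is performed, but the regression target is the truncated Bellman error `clamp(y - Q(s, a), -1, 1)` instead of `y` itself.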
37 | 38 | ## Prerequisites 39 | |Library | Version | 40 | |----------------------|----| 41 | |`Python`| `3.5.5 (Anaconda)`| 42 | |`torch`| `0.4.1`| 43 | |`gym`| `0.10.9`| 44 | |`tensorboard`| `1.12.0`| 45 | |`tensorboardX`| `1.5`| 46 | |`pybullet`| `2.4.2`, https://pypi.org/project/pybullet/| 47 | |`Box2D`| `2.3.8`| 48 | 49 | ## Files in the repository 50 | 51 | |File name | Purpose | 52 | |----------------------|------| 53 | |`ls_ddpg_main.py`| general-purpose main application for training/playing an LS-DDPG agent| 54 | |`play_ls_ddpg.py`| sample code for playing an environment, also in `ls_ddpg_main.py`| 55 | |`train_ls_ddpg.py`| sample code for training an environment, also in `ls_ddpg_main.py`| 56 | |`nn_agent_models.py`| agent and DDPG classes, holds the network, action selector and current state| 57 | |`Experience.py`| Replay Buffer classes| 58 | |`srl_algorithms.py`| Shallow RL algorithms, LS-UPDATE| 59 | |`utils.py`| utility functions| 60 | |`*.pth` / `*.dat`| Checkpoint files for the Agents (playing/continual learning)| 61 | |`Deep_RL_Shallow_Updates_for_Deep_Reinforcement_Learning.pdf`| Writeup - theory and results| 62 | 63 | ## API (`ls_ddpg_main.py --help`) 64 | 65 | 66 | You should use the `ls_ddpg_main.py` file with the following arguments: 67 | 68 | |Argument | Description | 69 | |-------------------------|---------------------------------------------| 70 | |-h, --help | shows arguments description | 71 | |-t, --train | train or continue training an agent | 72 | |-p, --play | play the environment using a pretrained agent | 73 | |-n, --name | model name, for saving and loading | 74 | |-k, --lsddpg | use LS-DDPG (apply LS-UPDATE every N_DRL), default: false | 75 | |-j, --boosting| use Boosted-FQI as SRL algorithm, default: false | 76 | |-y, --path| path to agent checkpoint, for playing | 77 | |-e, --env| environment to play: MinitaurBulletEnv-v0, BipedalWalker-v2, default="BipedalWalker-v2" | 78 | |-d, --decay_rate| number of episodes for epsilon decaying, default: 500000 | 79 | |-o, --optimizer| optimizing algorithm ('RMSprop', 'Adam'), default: 'Adam' | 80 | |--lr_critic| learning rate for the Critic optimizer, default: 0.0001 | 81 | |--lr_actor| learning rate for the Actor optimizer, default: 0.0001 | 82 | |-g, --gamma| gamma parameter for the Q-Learning, default: 0.99 | 83 | |-l, --lam| regularization parameter value, default: 1, 10000 (boosting) | 84 | |-s, --buffer_size| Replay Buffer size, default: 1000000 | 85 | |-b, --batch_size| number of samples in each batch, default: 64 | 86 | |-i, --steps_to_start_learn| number of steps before the agent starts learning, default: 10000 | 87 | |-c, --test_iter| number of iterations between policy testing, default: 10000 | 88 | |-x, --record| Directory to store video recording when playing (only Linux) | 89 | |--no-visualize| if not typed, render the environment when playing | 90 | 91 | ## Playing 92 | Agent checkpoints (files ending with `.pth`) are saved and loaded from the `agent_ckpt` directory. 93 | Playing a pretrained agent for one episode: 94 | 95 | `python ls_ddpg_main.py --play -y ./saves/ddpg-agent_BipedalWalker-v2-LS-LAM-10000-100K-BOOSTING-SEED-2019-BATCH-64/best_+316.064_2410000.dat -x ./Videos/` 96 | 97 | ## Training 98 | 99 | Examples: 100 | 101 | * `python ls_ddpg_main.py --train --lsddpg -e MinitaurBulletEnv-v0 -l 1 -b 64` 102 | * `python ls_ddpg_main.py --train --lsddpg --boosting -e BipedalWalker-v2 -l 10000 -b 64` 103 | 104 | For a full description of the flags, see the API above. 
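As an additional illustration, a run that also sets the discount factor, Replay Buffer size and test interval explicitly (the values here are simply the documented defaults from the API table):

`python ls_ddpg_main.py --train --lsddpg -e BipedalWalker-v2 -l 1 -b 64 -g 0.99 -s 1000000 -c 10000`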
105 | 106 | ## TensorBoard 107 | 108 | TensorBoard logs are written dynamically during the runs, and it is possible to observe the training progress using the graphs. In order to open TensorBoard, navigate to the source directory of the project and run in the terminal/cmd: 109 | 110 | `tensorboard --logdir=./runs` 111 | 112 | * make sure you have the correct environment activated (`conda activate env-name`) and that you have `tensorboard` and `tensorboardX` installed. 113 | 114 | ## References 115 | * [PyTorch Agent Net: reinforcement learning toolkit for pytorch](https://github.com/Shmuma/ptan) by [Max Lapan](https://github.com/Shmuma) 116 | * Nir Levine, Tom Zahavy, Daniel J. Mankowitz, Aviv Tamar, Shie Mannor [Shallow Updates for Deep Reinforcement Learning](https://arxiv.org/abs/1705.07461), NIPS 2017 117 | 118 | 119 | -------------------------------------------------------------------------------- /utils/nn_agent_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base models taken from the PTAN (PyTorch Agent Net) library by Shmuma 3 | https://github.com/Shmuma/ptan 4 | """ 5 | # imports 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | import copy 10 | 11 | HID_SIZE = 128 12 | 13 | 14 | def float32_preprocessor(states): 15 | np_states = np.array(states, dtype=np.float32) 16 | return torch.tensor(np_states) 17 | 18 | 19 | class TargetNet: 20 | """ 21 | Wrapper around a model which provides a copy of it instead of the trained weights 22 | """ 23 | 24 | def __init__(self, model): 25 | self.model = model 26 | self.target_model = copy.deepcopy(model) 27 | 28 | def sync(self): 29 | self.target_model.load_state_dict(self.model.state_dict()) 30 | 31 | def alpha_sync(self, alpha): 32 | """ 33 | Blend params of target net with params from the model 34 | :param alpha: fraction of the target network's weights to keep (the model contributes 1 - alpha) 35 | """ 36 | assert isinstance(alpha, float) 37 | assert 0.0 < alpha <= 1.0 38 | state = self.model.state_dict() 39 | tgt_state = self.target_model.state_dict() 40 | for k, v in state.items(): 41 | tgt_state[k] = tgt_state[k] * alpha + (1 - alpha) * v 42 | self.target_model.load_state_dict(tgt_state) 43 | 44 | 45 | class BaseAgent: 46 | """ 47 | Abstract Agent interface 48 | """ 49 | 50 | def initial_state(self): 51 | """ 52 | Should create initial empty state for the agent. 
It will be called for the start of the episode 53 | :return: Anything agent want to remember 54 | """ 55 | return None 56 | 57 | def __call__(self, states, agent_states): 58 | """ 59 | Convert observations and states into actions to take 60 | :param states: list of environment states to process 61 | :param agent_states: list of states with the same length as observations 62 | :return: tuple of actions, states 63 | """ 64 | assert isinstance(states, list) 65 | assert isinstance(agent_states, list) 66 | assert len(agent_states) == len(states) 67 | 68 | raise NotImplementedError 69 | 70 | 71 | class DDPGActor(nn.Module): 72 | def __init__(self, obs_size, act_size): 73 | super(DDPGActor, self).__init__() 74 | 75 | self.fc1 = nn.Sequential(nn.Linear(obs_size, 400), 76 | nn.ReLU() 77 | ) 78 | # self.fc2 = nn.Sequential(nn.Linear(400, 300), 79 | # nn.ReLU() 80 | # ) 81 | self.fc2 = nn.Linear(400, 300) 82 | self.relu_2 = nn.ReLU() 83 | self.fc3 = nn.Linear(300, act_size) 84 | self.tanh_3 = nn.Tanh() 85 | # self.net = nn.Sequential( 86 | # nn.Linear(obs_size, 400), 87 | # nn.ReLU(), 88 | # nn.Linear(400, 300), 89 | # nn.ReLU(), 90 | # nn.Linear(300, act_size), 91 | # nn.Tanh() 92 | # ) 93 | 94 | def forward(self, x): 95 | return self.tanh_3(self.fc3(self.relu_2(self.fc2(self.fc1(x))))) 96 | 97 | def forward_to_last_hidden(self, x): 98 | return self.relu_2(self.fc2(self.fc1(x))) 99 | 100 | 101 | class DDPGCritic(nn.Module): 102 | def __init__(self, obs_size, act_size): 103 | super(DDPGCritic, self).__init__() 104 | 105 | self.obs_net = nn.Sequential( 106 | nn.Linear(obs_size, 400), 107 | nn.ReLU() 108 | ) 109 | 110 | self.out_fc1 = nn.Linear(400 + act_size, 300) 111 | self.relu_1 = nn.ReLU() 112 | 113 | self.out_fc2 = nn.Linear(300, 1) 114 | 115 | # self.out_net = nn.Sequential( 116 | # nn.Linear(400 + act_size, 300), 117 | # nn.ReLU(), 118 | # nn.Linear(300, 1) 119 | # ) 120 | 121 | def forward(self, x, a): 122 | obs = self.obs_net(x) 123 | return self.out_fc2(self.relu_1(self.out_fc1((torch.cat([obs, a], dim=1))))) 124 | 125 | def forward_to_last_hidden(self, x, a): 126 | obs = self.obs_net(x) 127 | return self.relu_1(self.out_fc1((torch.cat([obs, a], dim=1)))) 128 | 129 | 130 | class AgentDDPG(BaseAgent): 131 | """ 132 | Agent implementing Orstein-Uhlenbeck exploration process. 
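On every call, the internal noise state x is updated as x += ou_teta * (ou_mu - x) + ou_sigma * N(0, 1), and epsilon * x is added to the deterministic action mu(s); when use_decaying_noise is True, epsilon decays linearly with the number of agent calls (at rate 1 / ou_decay_steps) down to ou_epsilon_end.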
133 | # Implemented noise decaying for convergence 134 | """ 135 | 136 | def __init__(self, net, device="cpu", ou_enabled=True, ou_mu=0.0, ou_teta=0.15, ou_sigma=0.2, ou_epsilon=1.0, 137 | ou_decay_steps=500000, ou_epsilon_end=0.01, use_decaying_noise=True): 138 | self.net = net 139 | self.device = device 140 | self.ou_enabled = ou_enabled 141 | self.ou_mu = ou_mu 142 | self.ou_teta = ou_teta 143 | self.ou_sigma = ou_sigma 144 | self.ou_epsilon = ou_epsilon 145 | self.ou_decay_steps = ou_decay_steps 146 | self.ou_epsilon_end = ou_epsilon_end 147 | self.use_decaying_noise = use_decaying_noise 148 | self.num_agent_calls = 0 149 | 150 | def initial_state(self): 151 | return None 152 | 153 | def __call__(self, states, agent_states): 154 | states_v = float32_preprocessor(states).to(self.device) 155 | # we use the deterministic output of the actor as the expected value 156 | mu_v = self.net(states_v) 157 | actions = mu_v.data.cpu().numpy() 158 | 159 | if self.ou_enabled and self.ou_epsilon > 0: 160 | new_a_states = [] 161 | for a_state, action in zip(agent_states, actions): 162 | if a_state is None: 163 | # initialization of the OU process 164 | a_state = np.zeros(shape=action.shape, dtype=np.float32) 165 | a_state += self.ou_teta * (self.ou_mu - a_state) 166 | a_state += self.ou_sigma * np.random.normal(size=action.shape) 167 | if self.use_decaying_noise: 168 | epsilon = max(self.ou_epsilon_end, self.ou_epsilon - self.num_agent_calls / self.ou_decay_steps) 169 | else: 170 | epsilon = self.ou_epsilon 171 | action += epsilon * a_state 172 | new_a_states.append(a_state) 173 | else: 174 | new_a_states = agent_states 175 | 176 | self.num_agent_calls += 1 177 | actions = np.clip(actions, -1, 1) 178 | return actions, new_a_states 179 | -------------------------------------------------------------------------------- /train_ls_ddpg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import gym 4 | import pybullet_envs 5 | # import argparse 6 | from tensorboardX import SummaryWriter 7 | import numpy as np 8 | import utils.nn_agent_models as agent_model 9 | import utils.Experience as Experience 10 | import utils.utils as utils 11 | # import utils 12 | from utils.srl_algorithms import ls_step 13 | import torch 14 | import torch.optim as optim 15 | import torch.nn.functional as F 16 | import random 17 | from utils.utils import test_net 18 | 19 | # ENV_ID = "MinitaurBulletEnv-v0" 20 | ENV_ID = "BipedalWalker-v2" 21 | GAMMA = 0.99 22 | BATCH_SIZE = 64 23 | LEARNING_RATE_ACTOR = 0.0001 24 | LEARNING_RATE_CRITIC = 0.0001 25 | REPLAY_SIZE = 100000 26 | REPLAY_INITIAL = 10000 # 10000 27 | N_DRL = 100000 28 | N_SRL = REPLAY_SIZE 29 | REWARD_TO_SOLVE = 300 30 | 31 | 32 | TEST_ITERS = 10000 # 1000 for Minitaure 33 | 34 | 35 | if __name__ == "__main__": 36 | # parser = argparse.ArgumentParser() 37 | # parser.add_argument("--cuda", default=False, action='store_true', help='Enable CUDA') 38 | # parser.add_argument("-n", "--name", required=True, help="Name of the run") 39 | # args = parser.parse_args() 40 | training_random_seed = 2019 41 | use_constant_seed = True # to compare performance independently of the randomness 42 | use_ls_ddpg = True 43 | use_boosting = False 44 | lam = 10 # regularization parameter 45 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 46 | env = gym.make(ENV_ID) 47 | test_env = gym.make(ENV_ID) 48 | name = "agent_" + ENV_ID 49 | if use_ls_ddpg: 50 | print("using LS-DDPG") 51 | name += "-LS-LAM-" + str(lam) 
+ "-" + str(int(1.0 * N_DRL / 1000)) + "K" 52 | if use_boosting: 53 | print("using boosting") 54 | name += "-BOOSTING" 55 | if use_constant_seed: 56 | name += "-SEED-" + str(training_random_seed) 57 | np.random.seed(training_random_seed) 58 | random.seed(training_random_seed) 59 | env.seed(training_random_seed) 60 | test_env.seed(training_random_seed) 61 | torch.manual_seed(training_random_seed) 62 | if torch.cuda.is_available(): 63 | torch.cuda.manual_seed_all(training_random_seed) 64 | print("training using constant seed of ", training_random_seed) 65 | name += "-BATCH-" + str(BATCH_SIZE) 66 | save_path = os.path.join("saves", "ddpg-" + name) 67 | os.makedirs(save_path, exist_ok=True) 68 | ckpt_save_path = './agent_ckpt/' + name + ".pth" 69 | if not os.path.exists('./agent_ckpt/'): 70 | os.makedirs('./agent_ckpt') 71 | 72 | act_net = agent_model.DDPGActor(env.observation_space.shape[0], env.action_space.shape[0]).to(device) 73 | crt_net = agent_model.DDPGCritic(env.observation_space.shape[0], env.action_space.shape[0]).to(device) 74 | print(act_net) 75 | print(crt_net) 76 | tgt_act_net = agent_model.TargetNet(act_net) 77 | tgt_crt_net = agent_model.TargetNet(crt_net) 78 | 79 | writer = SummaryWriter(comment="-ddpg-" + name) 80 | agent = agent_model.AgentDDPG(act_net, device=device) 81 | exp_source = Experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA, steps_count=1) 82 | buffer = Experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE) 83 | act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE_ACTOR) 84 | crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE_CRITIC) 85 | 86 | utils.load_agent_state(act_net, crt_net, [act_opt, crt_opt], path=ckpt_save_path) 87 | 88 | frame_idx = 0 89 | drl_updates = 0 90 | best_reward = None 91 | with utils.RewardTracker(writer) as tracker: 92 | with utils.TBMeanTracker(writer, batch_size=10) as tb_tracker: 93 | while True: 94 | frame_idx += 1 95 | buffer.populate(1) 96 | rewards_steps = exp_source.pop_rewards_steps() 97 | if rewards_steps: 98 | rewards, steps = zip(*rewards_steps) 99 | tb_tracker.track("episode_steps", steps[0], frame_idx) 100 | mean_reward = tracker.reward(rewards[0], frame_idx) 101 | if mean_reward is not None and mean_reward > REWARD_TO_SOLVE: 102 | print("environment solved in % steps" % frame_idx, " (% episodes)" % len(tracker.total_rewards)) 103 | utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt], frame_idx, 104 | len(tracker.total_rewards), path=ckpt_save_path) 105 | break 106 | 107 | if len(buffer) < REPLAY_INITIAL: 108 | continue 109 | 110 | batch = buffer.sample(BATCH_SIZE) 111 | states_v, actions_v, rewards_v, dones_mask, last_states_v = utils.unpack_batch(batch, device) 112 | 113 | # train critic 114 | crt_opt.zero_grad() 115 | q_v = crt_net(states_v, actions_v) 116 | last_act_v = tgt_act_net.target_model(last_states_v) 117 | q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v) 118 | q_last_v[dones_mask] = 0.0 119 | q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * GAMMA 120 | critic_loss_v = F.mse_loss(q_v, q_ref_v.detach()) 121 | critic_loss_v.backward() 122 | crt_opt.step() 123 | tb_tracker.track("loss_critic", critic_loss_v, frame_idx) 124 | tb_tracker.track("critic_ref", q_ref_v.mean(), frame_idx) 125 | 126 | drl_updates += 1 127 | # LS-UPDATE STEP for Critic (Q) 128 | if use_ls_ddpg and (drl_updates % N_DRL == 0) and (len(buffer) >= N_SRL): 129 | # if len(buffer) > 1: 130 | print("performing ls step...") 131 | batch = buffer.sample(N_SRL) 132 | ls_step([act_net, 
crt_net], [tgt_act_net, tgt_crt_net], batch, GAMMA, len(buffer), 133 | lam=lam, m_batch_size=256, device=device, use_boosting=use_boosting) 134 | 135 | # train actor 136 | act_opt.zero_grad() 137 | cur_actions_v = act_net(states_v) 138 | actor_loss_v = -crt_net(states_v, cur_actions_v) 139 | actor_loss_v = actor_loss_v.mean() 140 | actor_loss_v.backward() 141 | act_opt.step() 142 | tb_tracker.track("loss_actor", actor_loss_v, frame_idx) 143 | 144 | tgt_act_net.alpha_sync(alpha=1 - 1e-3) 145 | tgt_crt_net.alpha_sync(alpha=1 - 1e-3) 146 | 147 | if frame_idx % TEST_ITERS == 0: 148 | ts = time.time() 149 | rewards, steps = test_net(act_net, test_env, agent_model, device=device) 150 | print("Test done in %.2f sec, reward %.3f, steps %d" % ( 151 | time.time() - ts, rewards, steps)) 152 | writer.add_scalar("test_reward", rewards, frame_idx) 153 | writer.add_scalar("test_steps", steps, frame_idx) 154 | if best_reward is None or best_reward < rewards: 155 | if best_reward is not None: 156 | print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) 157 | name = "best_%+.3f_%d.dat" % (rewards, frame_idx) 158 | fname = os.path.join(save_path, name) 159 | torch.save(act_net.state_dict(), fname) 160 | utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt], frame_idx, 161 | len(tracker.total_rewards), path=ckpt_save_path) 162 | best_reward = rewards 163 | 164 | pass 165 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions. 3 | Most taken from the PTAN (PyTorch Agent Net) library by Shmuma 4 | https://github.com/Shmuma/ptan 5 | """ 6 | 7 | # imports 8 | import numpy as np 9 | import torch 10 | import time 11 | import sys 12 | import collections 13 | import os 14 | from utils.nn_agent_models import float32_preprocessor 15 | 16 | 17 | def test_net(net, env, agent_model, count=10, device="cpu"): 18 | rewards = 0.0 19 | steps = 0 20 | for _ in range(count): 21 | obs = env.reset() 22 | while True: 23 | obs_v = agent_model.float32_preprocessor([obs]).to(device) 24 | mu_v = net(obs_v) 25 | action = mu_v.squeeze(dim=0).data.cpu().numpy() 26 | action = np.clip(action, -1, 1) 27 | obs, reward, done, _ = env.step(action) 28 | rewards += reward 29 | steps += 1 30 | if done: 31 | break 32 | return rewards / count, steps / count 33 | 34 | 35 | def save_agent_state(act_net, crt_net, optimizers, frame, games, save_replay=False, replay_buffer=None, name='', 36 | path=None): 37 | """ 38 | This function saves the current state of the NN (the weights) to a local file. 
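The checkpoint is a single dictionary written with torch.save, containing the Actor and Critic state dicts, both optimizer state dicts, the current frame count, the number of games seen and, optionally, the replay buffer.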
39 | :param act_net: the current actor NN (nn.Module) 40 | :param crt_net: the current critic NN (nn.Module) 41 | :param optimizers: the network's optimizer (torch.optim) 42 | :param frame: current frame number (int) 43 | :param games: total number of games seen (int) 44 | :param save_replay: whether or not to save the replay buffer (bool) 45 | :param replay_buffer: the replay buffer (list) 46 | :param name: specific name for the checkpoint (str) 47 | :param path: path to specific location where to save (str) 48 | """ 49 | dir_name = './agent_ckpt' 50 | if path: 51 | full_path = path 52 | else: 53 | if name: 54 | filename = "agent_ls_ddpg_" + name + ".pth" 55 | else: 56 | filename = "agent_ls_ddpg.pth" 57 | dir_name = './agent_ckpt' 58 | full_path = os.path.join(dir_name, filename) 59 | if not os.path.exists(dir_name): 60 | os.makedirs(dir_name) 61 | if save_replay and replay_buffer is not None: 62 | torch.save({ 63 | 'act_state_dict': act_net.state_dict(), 64 | 'crt_state_dict': crt_net.state_dict(), 65 | 'act_optimizer_state_dict': optimizers[0].state_dict(), 66 | 'crt_optimizer_state_dict': optimizers[1].state_dict(), 67 | 'frame_count': frame, 68 | 'games': games, 69 | 'replay_buffer': replay_buffer 70 | }, full_path) 71 | else: 72 | torch.save({ 73 | 'act_state_dict': act_net.state_dict(), 74 | 'crt_state_dict': crt_net.state_dict(), 75 | 'act_optimizer_state_dict': optimizers[0].state_dict(), 76 | 'crt_optimizer_state_dict': optimizers[1].state_dict(), 77 | 'frame_count': frame, 78 | 'games': games 79 | }, full_path) 80 | print("Saved Agent checkpoint @ ", full_path) 81 | 82 | 83 | def load_agent_state(act_net, crt_net, optimizers, path=None, copy_to_target_network=False, load_optimizer=True, 84 | target_nets=None, buffer=None, load_buffer=False): 85 | """ 86 | This function loads a state of the NN (the weights) from a local file. 
87 | :param act_net: the current actor NN (nn.Module) 88 | :param crt_net: the current critic NN (nn.Module) 89 | :param optimizers: the network's optimizers (torch.optim) 90 | :param path: full path to checkpoint file (.pth) (str) 91 | :param copy_to_target_network: whether or not to copy the weights to target network (bool) 92 | :param load_optimizer: whether or not to load the optimizer state (bool) 93 | :param load_buffer: whether or not to load the replay buffer (bool) 94 | :param buffer: the replay buffer 95 | :param target_nets: the target NNs 96 | """ 97 | if path is None: 98 | raise SystemExit("path to model must be specified") 99 | else: 100 | full_path = path 101 | exists = os.path.isfile(full_path) 102 | if exists: 103 | if not torch.cuda.is_available(): 104 | checkpoint = torch.load(full_path, map_location='cpu') 105 | else: 106 | checkpoint = torch.load(full_path) 107 | act_net.load_state_dict(checkpoint['act_state_dict']) 108 | crt_net.load_state_dict(checkpoint['crt_state_dict']) 109 | if load_optimizer: 110 | optimizers[0].load_state_dict(checkpoint['act_optimizer_state_dict']) 111 | optimizers[1].load_state_dict(checkpoint['crt_optimizer_state_dict']) 112 | # self.steps_count = checkpoint['steps_count'] 113 | # self.episodes_seen = checkpoint['episodes_seen'] 114 | # selector.epsilon = checkpoint['epsilon'] 115 | # self.num_param_update = checkpoint['num_param_updates'] 116 | print("Checkpoint loaded successfully from ", full_path) 117 | # # for manual loading a checkpoint 118 | if copy_to_target_network and target_nets is not None: 119 | target_nets[0].sync() 120 | target_nets[1].sync() 121 | if load_buffer and buffer is not None: 122 | buffer.buffer = checkpoint['replay_buffer'] 123 | else: 124 | print("No checkpoint found...") 125 | 126 | 127 | def unpack_batch(batch, device="cpu"): 128 | states, actions, rewards, dones, last_states = [], [], [], [], [] 129 | for exp in batch: 130 | states.append(exp.state) 131 | actions.append(exp.action) 132 | rewards.append(exp.reward) 133 | dones.append(exp.last_state is None) 134 | if exp.last_state is None: 135 | last_states.append(exp.state) 136 | else: 137 | last_states.append(exp.last_state) 138 | states_v = float32_preprocessor(states).to(device) 139 | actions_v = float32_preprocessor(actions).to(device) 140 | rewards_v = float32_preprocessor(rewards).to(device) 141 | last_states_v = float32_preprocessor(last_states).to(device) 142 | dones_t = torch.ByteTensor(dones).to(device) 143 | return states_v, actions_v, rewards_v, dones_t, last_states_v 144 | 145 | 146 | class TBMeanTracker: 147 | """ 148 | TensorBoard value tracker: allows to batch fixed amount of historical values and write their mean into TB 149 | 150 | Designed and tested with pytorch-tensorboard in mind 151 | """ 152 | def __init__(self, writer, batch_size): 153 | """ 154 | :param writer: writer with close() and add_scalar() methods 155 | :param batch_size: integer size of batch to track 156 | """ 157 | assert isinstance(batch_size, int) 158 | assert writer is not None 159 | self.writer = writer 160 | self.batch_size = batch_size 161 | 162 | def __enter__(self): 163 | self._batches = collections.defaultdict(list) 164 | return self 165 | 166 | def __exit__(self, exc_type, exc_val, exc_tb): 167 | self.writer.close() 168 | 169 | @staticmethod 170 | def _as_float(value): 171 | assert isinstance(value, (float, int, np.ndarray, np.generic, torch.autograd.Variable)) or torch.is_tensor(value) 172 | tensor_val = None 173 | if isinstance(value, torch.autograd.Variable): 
174 | tensor_val = value.data 175 | elif torch.is_tensor(value): 176 | tensor_val = value 177 | 178 | if tensor_val is not None: 179 | return tensor_val.float().mean() 180 | elif isinstance(value, np.ndarray): 181 | return float(np.mean(value)) 182 | else: 183 | return float(value) 184 | 185 | def track(self, param_name, value, iter_index): 186 | assert isinstance(param_name, str) 187 | assert isinstance(iter_index, int) 188 | 189 | data = self._batches[param_name] 190 | data.append(self._as_float(value)) 191 | 192 | if len(data) >= self.batch_size: 193 | self.writer.add_scalar(param_name, np.mean(data), iter_index) 194 | data.clear() 195 | 196 | 197 | class RewardTracker: 198 | def __init__(self, writer, min_ts_diff=1.0): 199 | """ 200 | Constructs RewardTracker 201 | :param writer: writer to use for writing stats 202 | :param min_ts_diff: minimal time difference to track speed 203 | """ 204 | self.writer = writer 205 | self.min_ts_diff = min_ts_diff 206 | 207 | def __enter__(self): 208 | self.ts = time.time() 209 | self.ts_frame = 0 210 | self.total_rewards = [] 211 | return self 212 | 213 | def __exit__(self, *args): 214 | self.writer.close() 215 | 216 | def reward(self, reward, frame, epsilon=None): 217 | self.total_rewards.append(reward) 218 | mean_reward = np.mean(self.total_rewards[-100:]) 219 | ts_diff = time.time() - self.ts 220 | if ts_diff > self.min_ts_diff: 221 | speed = (frame - self.ts_frame) / ts_diff 222 | self.ts_frame = frame 223 | self.ts = time.time() 224 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 225 | print("%d: done %d episodes, mean reward %.3f, speed %.2f f/s%s" % ( 226 | frame, len(self.total_rewards), mean_reward, speed, epsilon_str 227 | )) 228 | sys.stdout.flush() 229 | self.writer.add_scalar("speed", speed, frame) 230 | if epsilon is not None: 231 | self.writer.add_scalar("epsilon", epsilon, frame) 232 | self.writer.add_scalar("reward_100", mean_reward, frame) 233 | self.writer.add_scalar("reward", reward, frame) 234 | return mean_reward if len(self.total_rewards) > 30 else None 235 | 236 | -------------------------------------------------------------------------------- /utils/Experience.py: -------------------------------------------------------------------------------- 1 | """ 2 | Experience Source. 
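Provides the n-step ExperienceSource, the ExperienceSourceFirstLast wrapper (which emits only the first and last states of each n-step piece together with the discounted reward), and a simple ExperienceReplayBuffer.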
3 | Base models taken from the PTAN (PyTorch Agent Net) library by Shmuma 4 | https://github.com/Shmuma/ptan 5 | """ 6 | 7 | import gym 8 | import collections 9 | 10 | 11 | import numpy as np 12 | 13 | from collections import namedtuple, deque 14 | 15 | from utils.nn_agent_models import BaseAgent 16 | 17 | # one single experience step 18 | Experience = namedtuple('Experience', ['state', 'action', 'reward', 'done']) 19 | 20 | 21 | class ExperienceSource: 22 | """ 23 | Simple n-step experience source using single or multiple environments 24 | 25 | Every experience contains n list of Experience entries 26 | """ 27 | def __init__(self, env, agent, steps_count=2, steps_delta=1, vectorized=False): 28 | """ 29 | Create simple experience source 30 | :param env: environment or list of environments to be used 31 | :param agent: callable to convert batch of states into actions to take 32 | :param steps_count: count of steps to track for every experience chain 33 | :param steps_delta: how many steps to do between experience items 34 | :param vectorized: support of vectorized envs from OpenAI universe 35 | """ 36 | assert isinstance(env, (gym.Env, list, tuple)) 37 | assert isinstance(agent, BaseAgent) 38 | assert isinstance(steps_count, int) 39 | assert steps_count >= 1 40 | assert isinstance(vectorized, bool) 41 | if isinstance(env, (list, tuple)): 42 | self.pool = env 43 | else: 44 | self.pool = [env] 45 | self.agent = agent 46 | self.steps_count = steps_count 47 | self.steps_delta = steps_delta 48 | self.total_rewards = [] 49 | self.total_steps = [] 50 | self.vectorized = vectorized 51 | 52 | def __iter__(self): 53 | states, agent_states, histories, cur_rewards, cur_steps = [], [], [], [], [] 54 | env_lens = [] 55 | for env in self.pool: 56 | obs = env.reset() 57 | # if the environment is vectorized, all it's output is lists of results. 
58 | # Details are here: https://github.com/openai/universe/blob/master/doc/env_semantics.rst 59 | if self.vectorized: 60 | obs_len = len(obs) 61 | states.extend(obs) 62 | else: 63 | obs_len = 1 64 | states.append(obs) 65 | env_lens.append(obs_len) 66 | 67 | for _ in range(obs_len): 68 | histories.append(deque(maxlen=self.steps_count)) 69 | cur_rewards.append(0.0) 70 | cur_steps.append(0) 71 | agent_states.append(self.agent.initial_state()) 72 | 73 | iter_idx = 0 74 | while True: 75 | actions = [None] * len(states) 76 | states_input = [] 77 | states_indices = [] 78 | for idx, state in enumerate(states): 79 | if state is None: 80 | actions[idx] = self.pool[0].action_space.sample() # assume that all envs are from the same family 81 | else: 82 | states_input.append(state) 83 | states_indices.append(idx) 84 | if states_input: 85 | states_actions, new_agent_states = self.agent(states_input, agent_states) 86 | for idx, action in enumerate(states_actions): 87 | g_idx = states_indices[idx] 88 | actions[g_idx] = action 89 | agent_states[g_idx] = new_agent_states[idx] 90 | grouped_actions = _group_list(actions, env_lens) 91 | 92 | global_ofs = 0 93 | for env_idx, (env, action_n) in enumerate(zip(self.pool, grouped_actions)): 94 | if self.vectorized: 95 | next_state_n, r_n, is_done_n, _ = env.step(action_n) 96 | else: 97 | next_state, r, is_done, _ = env.step(action_n[0]) 98 | next_state_n, r_n, is_done_n = [next_state], [r], [is_done] 99 | 100 | for ofs, (action, next_state, r, is_done) in enumerate(zip(action_n, next_state_n, r_n, is_done_n)): 101 | idx = global_ofs + ofs 102 | state = states[idx] 103 | history = histories[idx] 104 | 105 | cur_rewards[idx] += r 106 | cur_steps[idx] += 1 107 | if state is not None: 108 | history.append(Experience(state=state, action=action, reward=r, done=is_done)) 109 | if len(history) == self.steps_count and iter_idx % self.steps_delta == 0: 110 | yield tuple(history) 111 | states[idx] = next_state 112 | if is_done: 113 | # generate tail of history 114 | while len(history) >= 1: 115 | yield tuple(history) 116 | history.popleft() 117 | self.total_rewards.append(cur_rewards[idx]) 118 | self.total_steps.append(cur_steps[idx]) 119 | cur_rewards[idx] = 0.0 120 | cur_steps[idx] = 0 121 | # vectorized envs are reset automatically 122 | states[idx] = env.reset() if not self.vectorized else None 123 | agent_states[idx] = self.agent.initial_state() 124 | history.clear() 125 | global_ofs += len(action_n) 126 | iter_idx += 1 127 | 128 | def pop_total_rewards(self): 129 | r = self.total_rewards 130 | if r: 131 | self.total_rewards = [] 132 | self.total_steps = [] 133 | return r 134 | 135 | def pop_rewards_steps(self): 136 | res = list(zip(self.total_rewards, self.total_steps)) 137 | if res: 138 | self.total_rewards, self.total_steps = [], [] 139 | return res 140 | 141 | 142 | def _group_list(items, lens): 143 | """ 144 | Unflat the list of items by lens 145 | :param items: list of items 146 | :param lens: list of integers 147 | :return: list of list of items grouped by lengths 148 | """ 149 | res = [] 150 | cur_ofs = 0 151 | for g_len in lens: 152 | res.append(items[cur_ofs:cur_ofs+g_len]) 153 | cur_ofs += g_len 154 | return res 155 | 156 | 157 | # those entries are emitted from ExperienceSourceFirstLast. 
Reward is discounted over the trajectory piece 158 | ExperienceFirstLast = collections.namedtuple('ExperienceFirstLast', ('state', 'action', 'reward', 'last_state')) 159 | 160 | 161 | class ExperienceSourceFirstLast(ExperienceSource): 162 | """ 163 | This is a wrapper around ExperienceSource to prevent storing full trajectory in replay buffer when we need 164 | only first and last states. For every trajectory piece it calculates discounted reward and emits only first 165 | and last states and action taken in the first state. 166 | 167 | If we have partial trajectory at the end of episode, last_state will be None 168 | """ 169 | def __init__(self, env, agent, gamma, steps_count=1, steps_delta=1, vectorized=False): 170 | assert isinstance(gamma, float) 171 | super(ExperienceSourceFirstLast, self).__init__(env, agent, steps_count+1, steps_delta, vectorized=vectorized) 172 | self.gamma = gamma 173 | self.steps = steps_count 174 | 175 | def __iter__(self): 176 | for exp in super(ExperienceSourceFirstLast, self).__iter__(): 177 | if exp[-1].done and len(exp) <= self.steps: 178 | last_state = None 179 | elems = exp 180 | else: 181 | last_state = exp[-1].state 182 | elems = exp[:-1] 183 | total_reward = 0.0 184 | for e in reversed(elems): 185 | total_reward *= self.gamma 186 | total_reward += e.reward 187 | yield ExperienceFirstLast(state=exp[0].state, action=exp[0].action, 188 | reward=total_reward, last_state=last_state) 189 | 190 | 191 | class ExperienceReplayBuffer: 192 | def __init__(self, experience_source, buffer_size): 193 | assert isinstance(experience_source, (ExperienceSource, type(None))) 194 | assert isinstance(buffer_size, int) 195 | self.experience_source_iter = None if experience_source is None else iter(experience_source) 196 | self.buffer = [] 197 | self.capacity = buffer_size 198 | self.pos = 0 199 | 200 | def __len__(self): 201 | return len(self.buffer) 202 | 203 | def __iter__(self): 204 | return iter(self.buffer) 205 | 206 | def sample(self, batch_size): 207 | """ 208 | Get one random batch from experience replay 209 | TODO: implement sampling order policy 210 | :param batch_size: 211 | :return: 212 | """ 213 | if len(self.buffer) <= batch_size: 214 | return self.buffer 215 | # Warning: replace=False makes random.choice O(n) 216 | keys = np.random.choice(len(self.buffer), batch_size, replace=True) 217 | return [self.buffer[key] for key in keys] 218 | 219 | def _add(self, sample): 220 | if len(self.buffer) < self.capacity: 221 | self.buffer.append(sample) 222 | else: 223 | self.buffer[self.pos] = sample 224 | self.pos = (self.pos + 1) % self.capacity 225 | 226 | def populate(self, samples): 227 | """ 228 | Populates samples into the buffer 229 | :param samples: how many samples to populate 230 | """ 231 | for _ in range(samples): 232 | entry = next(self.experience_source_iter) 233 | self._add(entry) 234 | -------------------------------------------------------------------------------- /utils/srl_algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file implements the SRL algorithms. 3 | Author: Tal Daniel 4 | """ 5 | 6 | # imports 7 | import torch 8 | import utils.utils as utils 9 | import copy 10 | 11 | 12 | def calc_fqi_matrices(nets, tgt_nets, batch, gamma, n_srl, m_batch_size=512, device='cpu', 13 | use_boosting=False, train_actor=False): 14 | """ 15 | This function calculates A and b tensors for the FQI solution. 
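For the Critic, A = (1 / n_srl) * sum_i phi(s_i, a_i) * phi(s_i, a_i)^T and b = (1 / n_srl) * sum_i phi(s_i, a_i) * y_i, where phi is the last-hidden-layer feature vector and y_i = r_i + gamma * Q_target(s'_i, mu_target(s'_i)) (replaced by the truncated Bellman error when use_boosting is True); the bias terms are accumulated the same way with a constant feature of 1.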
16 | :param batch: batch of samples to extract features from (list) 17 | :param nets: networks to extract features from (nn.Module) 18 | :param tgt_nets: target networks from which Q values of next states are calculated (nn.Module) 19 | :param gamma: discount factor (float) 20 | :param n_srl: number of samples to include in the FQI solution 21 | :param m_batch_size: number of samples to calculate simultaneously (int) 22 | :param device: on which device to perform the calculation (cpu/gpu) 23 | :param use_boosting: whether or not to use Boosted FQI 24 | :param: train_actor: whether or not to train actor net (bool) 25 | :return: A, A_bias, b, b_bias parameters for calculating the LS (np.arrays) 26 | """ 27 | num_batches = n_srl // m_batch_size 28 | act_net, crt_net = nets 29 | tgt_act_net, tgt_crt_net = tgt_nets 30 | dim_act = act_net.fc2.out_features 31 | dim_crt = crt_net.out_fc1.out_features 32 | num_actions = act_net.fc3.out_features 33 | 34 | if train_actor: 35 | A_act = torch.zeros([dim_act * num_actions, dim_act * num_actions], dtype=torch.float32).to(device) 36 | A_act_bias = torch.zeros([1 * num_actions, 1 * num_actions], dtype=torch.float32).to(device) 37 | b_act = torch.zeros([dim_act * num_actions, 1], dtype=torch.float32).to(device) 38 | b_act_bias = torch.zeros([1 * num_actions, 1], dtype=torch.float32).to(device) 39 | 40 | A_crt = torch.zeros([dim_crt * 1, dim_crt * 1], dtype=torch.float32).to(device) 41 | A_crt_bias = torch.zeros([1 * 1, 1 * 1], dtype=torch.float32).to(device) 42 | b_crt = torch.zeros([dim_crt * 1, 1], dtype=torch.float32).to(device) 43 | b_crt_bias = torch.zeros([1 * 1, 1], dtype=torch.float32).to(device) 44 | 45 | for i in range(num_batches): 46 | idx = i * m_batch_size 47 | if i == num_batches - 1: 48 | states_v, actions_v, rewards_v, dones_mask, last_states_v = utils.unpack_batch(batch[idx:], device) 49 | else: 50 | states_v, actions_v, rewards_v, dones_mask, last_states_v = utils.unpack_batch( 51 | batch[idx: idx + m_batch_size], device) 52 | if train_actor: 53 | states_features_act = act_net.forward_to_last_hidden(states_v) 54 | states_features_crt = crt_net.forward_to_last_hidden(states_v, actions_v) 55 | 56 | # augmentation 57 | states_features_crt_bias = torch.ones([states_features_crt.shape[0], 1 * 1], 58 | dtype=torch.float32).to(device) 59 | if train_actor: 60 | states_features_act_aug = states_features_act.detach().repeat( 61 | (1, num_actions)).to( 62 | device) 63 | states_features_act_bias_aug = torch.ones([states_features_act.shape[0], 1 * num_actions], 64 | dtype=torch.float32).to(device) 65 | 66 | states_features_act_mat = torch.mm(torch.t(states_features_act_aug), states_features_act_aug) 67 | states_features_act_bias_mat = torch.mm(torch.t(states_features_act_bias_aug), states_features_act_bias_aug) 68 | 69 | states_features_crt_mat = torch.mm(torch.t(states_features_crt.detach()), states_features_crt.detach()) 70 | states_features_crt_bias_mat = torch.mm(torch.t(states_features_crt_bias), states_features_crt_bias) 71 | 72 | q_v = crt_net(states_v, actions_v) 73 | last_act_v = tgt_act_net.target_model(last_states_v) 74 | q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v) 75 | q_last_v[dones_mask] = 0.0 76 | q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * gamma # y_i 77 | 78 | if use_boosting: 79 | # calculate truncated bellman error 80 | bellman_error = q_ref_v.detach() - q_v.detach() 81 | truncated_bellman_error = bellman_error.clamp(-1, 1) 82 | 83 | if train_actor: 84 | b_act += 
torch.mm(torch.t(states_features_act_aug.detach()), 85 | truncated_bellman_error.detach().view(-1, 1)) 86 | b_act_bias += torch.mm(torch.t(states_features_act_bias_aug), 87 | truncated_bellman_error.detach().view(-1, 1)) 88 | 89 | b_crt += torch.mm(torch.t(states_features_crt.detach()), 90 | truncated_bellman_error.detach().view(-1, 1)) 91 | b_crt_bias += torch.mm(torch.t(states_features_crt_bias), 92 | truncated_bellman_error.detach().view(-1, 1)) 93 | else: 94 | if train_actor: 95 | b_act += torch.mm(torch.t(states_features_act_aug.detach()), 96 | q_ref_v.detach().view(-1, 1)) 97 | b_act_bias += torch.mm(torch.t(states_features_act_bias_aug), 98 | q_ref_v.detach().view(-1, 1)) 99 | 100 | b_crt += torch.mm(torch.t(states_features_crt.detach()), 101 | q_ref_v.detach().view(-1, 1)) 102 | b_crt_bias += torch.mm(torch.t(states_features_crt_bias), 103 | q_ref_v.detach().view(-1, 1)) 104 | if train_actor: 105 | A_act += states_features_act_mat.detach() 106 | A_act_bias += states_features_act_bias_mat 107 | A_crt += states_features_crt_mat.detach() 108 | A_crt_bias += states_features_crt_bias_mat 109 | if train_actor: 110 | A_act = (1.0 / n_srl) * A_act 111 | A_act_bias = (1.0 / n_srl) * A_act_bias 112 | b_act = (1.0 / n_srl) * b_act 113 | b_act_bias = (1.0 / n_srl) * b_act_bias 114 | 115 | A_crt = (1.0 / n_srl) * A_crt 116 | A_crt_bias = (1.0 / n_srl) * A_crt_bias 117 | b_crt = (1.0 / n_srl) * b_crt 118 | b_crt_bias = (1.0 / n_srl) * b_crt_bias 119 | 120 | if train_actor: 121 | return A_act, A_act_bias, b_act, b_act_bias, A_crt, A_crt_bias, b_crt, b_crt_bias 122 | else: 123 | return A_crt, A_crt_bias, b_crt, b_crt_bias 124 | 125 | 126 | def calc_fqi_w_srl(a, a_bias, b, b_bias, w, w_b, lam=1.0, device='cpu'): 127 | """ 128 | This function calculates the closed-form solution of the DQI algorithm. 129 | :param a: A matrix built from features (np.array) 130 | :param a_bias: same, but for bias 131 | :param b: b vector built from features and rewards (np.array) 132 | :param b_bias: same, but for bias 133 | :param w: weights of the last hidden layer in the DQN (np.array) 134 | :param w_b: bias weights 135 | :param lam: regularization parameter for the Least-Square (float) 136 | :param device: on which device to perform the calculation (cpu/gpu) 137 | :return: w_srl: retrained weights using FQI closed-form solution (np.array) 138 | """ 139 | num_actions = w.shape[0] 140 | dim = w.shape[1] 141 | w = w.view(-1, 1) 142 | w_b = w_b.view(-1, 1) 143 | w_srl = torch.mm(torch.inverse(a + lam * torch.eye(num_actions * dim).to(device)), b + lam * w.detach()) 144 | w_b_srl = torch.mm(torch.inverse(a_bias + lam * torch.eye(num_actions * 1).to(device)), b_bias + lam * w_b.detach()) 145 | return w_srl.view(num_actions, dim), w_b_srl.squeeze() 146 | 147 | 148 | def ls_step(nets, tgt_nets, batch, gamma, n_srl, lam=1.0, m_batch_size=256, device='cpu', use_boosting=False, 149 | sync_tgt=False): 150 | """ 151 | This function performs the least-squares update on the last hidden layer weights. 
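The new last-layer weights are the regularized FQI solution w_srl = (A + lam * I)^-1 * (b + lam * w) computed by calc_fqi_w_srl; they are loaded into the Critic's last layer (crt_net.out_fc2) and, when train_actor is enabled, into the Actor's last layer (act_net.fc3).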
152 | :param batch: batch of samples to extract features from (list) 153 | :param nets: networks to extract features from (nn.Module) 154 | :param tgt_nets: target networks from which Q values of next states are calculated (nn.Module) 155 | :param gamma: discount factor (float) 156 | :param n_srl: number of samples to include in the FQI solution 157 | :param lam: regularization parameter for the Least-Square (float) 158 | :param m_batch_size: number of samples to calculate simultaneously (int) 159 | :param device: on which device to perform the calculation (cpu/gpu) 160 | :param use_boosting: whether or not to use Boosted FQI 161 | :param sync_tgt: whether or not to sync target networks (bool) 162 | :return: 163 | """ 164 | train_actor = False 165 | act_net, crt_net = nets 166 | tgt_act_net, tgt_crt_net = tgt_nets 167 | a_b_s = calc_fqi_matrices(nets, tgt_nets, batch, gamma, 168 | n_srl, m_batch_size=m_batch_size, device=device, use_boosting=use_boosting, 169 | train_actor=train_actor 170 | ) 171 | if train_actor: 172 | a_act, a_act_bias, b_act, b_act_bias, a_crt, a_crt_bias, b_crt, b_crt_bias = a_b_s 173 | else: 174 | a_crt, a_crt_bias, b_crt, b_crt_bias = a_b_s 175 | 176 | if train_actor: 177 | w_act_last_dict = copy.deepcopy(act_net.fc3.state_dict()) 178 | w_act_last_dict_before = copy.deepcopy(act_net.fc3.state_dict()) 179 | w_act_srl, w_b_act_srl = calc_fqi_w_srl(a_act.detach(), a_act_bias.detach(), b_act.detach(), 180 | b_act_bias.detach(), 181 | w_act_last_dict['weight'], w_act_last_dict['bias'], lam=lam, 182 | device=device) 183 | 184 | w_crt_last_dict = copy.deepcopy(crt_net.out_fc2.state_dict()) 185 | w_crt_last_dict_before = copy.deepcopy(crt_net.out_fc2.state_dict()) 186 | w_crt_srl, w_b_crt_srl = calc_fqi_w_srl(a_crt.detach(), a_crt_bias.detach(), b_crt.detach(), 187 | b_crt_bias.detach(), 188 | w_crt_last_dict['weight'], w_crt_last_dict['bias'], lam=lam, 189 | device=device) 190 | 191 | if train_actor: 192 | w_act_last_dict['weight'] = w_act_srl.detach() 193 | w_act_last_dict['bias'] = w_b_act_srl.detach() 194 | act_net.fc3.load_state_dict(w_act_last_dict) 195 | 196 | weight_diff_act = torch.sum((w_act_last_dict['weight'] - w_act_last_dict_before['weight']) ** 2) 197 | bias_diff_act = torch.sum((w_act_last_dict['bias'] - w_act_last_dict_before['bias']) ** 2) 198 | total_weight_diff_act = torch.sqrt(weight_diff_act + bias_diff_act) 199 | 200 | w_crt_last_dict['weight'] = w_crt_srl.detach() 201 | w_crt_last_dict['bias'] = w_b_crt_srl.detach().unsqueeze(-1) 202 | crt_net.out_fc2.load_state_dict(w_crt_last_dict) 203 | # weight diff 204 | weight_diff_crt = torch.sum((w_crt_last_dict['weight'] - w_crt_last_dict_before['weight']) ** 2) 205 | bias_diff_crt = torch.sum((w_crt_last_dict['bias'] - w_crt_last_dict_before['bias']) ** 2) 206 | total_weight_diff_crt = torch.sqrt(weight_diff_crt + bias_diff_crt) 207 | 208 | if sync_tgt: 209 | if train_actor: 210 | tgt_act_net.alpha_sync(alpha=1 - 1e-3) 211 | tgt_crt_net.alpha_sync(alpha=1 - 1e-3) 212 | 213 | if train_actor: 214 | print("total weight difference of ls-update:: actor: %.3f" % total_weight_diff_act.item(), 215 | " critic: %.3f" % total_weight_diff_crt.item()) 216 | else: 217 | print("total weight difference of ls-update:: critic: %.3f" % total_weight_diff_crt.item()) 218 | print("least-squares step done.") 219 | -------------------------------------------------------------------------------- /ls_ddpg_main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main application for 
Least-Squares Deep Deterministic Policy Gradients 3 | """ 4 | 5 | import os 6 | import time 7 | import gym 8 | import pybullet_envs 9 | from tensorboardX import SummaryWriter 10 | import numpy as np 11 | import utils.nn_agent_models as agent_model 12 | import utils.Experience as Experience 13 | import utils.utils as utils 14 | from utils.srl_algorithms import ls_step 15 | import torch 16 | import torch.optim as optim 17 | import torch.nn.functional as F 18 | import random 19 | from utils.utils import test_net 20 | import argparse 21 | 22 | REWARD_TO_SOLVE = 300 # mean reward the environment is considered SOLVED 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser(description="train and play an LS-DQN agent") 26 | # modes 27 | parser.add_argument("-t", "--train", help="train or continue training an agent", 28 | action="store_true") 29 | parser.add_argument("-k", "--lsddpg", help="use LS-DDPG", 30 | action="store_true") 31 | parser.add_argument("-j", "--boosting", help="use boosting", 32 | action="store_true") 33 | parser.add_argument("-p", "--play", help="play the environment using an a pretrained agent", 34 | action="store_true") 35 | parser.add_argument("-y", "--path", type=str, help="path to agent checkpoint, for playing") 36 | # arguments 37 | # for training and playing 38 | parser.add_argument("-n", "--name", type=str, 39 | help="model name, for saving and loading," 40 | " if not set, training will continue from a pretrained checkpoint") 41 | parser.add_argument("-e", "--env", type=str, 42 | help="environment to play: MinitaurBulletEnv-v0, BipedalWalker-v2", default="BipedalWalker-v2") 43 | # for training 44 | parser.add_argument("-d", "--decay_rate", type=int, 45 | help="number of episodes for epsilon decaying, default: 500000") 46 | parser.add_argument("-o", "--optimizer", type=str, 47 | help="optimizing algorithm ('RMSprop', 'Adam'), deafult: 'Adam'") 48 | parser.add_argument("--lr_actor", type=float, 49 | help="learning rate for the Actor optimizer, default: 0.0001") 50 | parser.add_argument("--lr_critic", type=float, 51 | help="learning rate for the Critic optimizer, default: 0.0001") 52 | parser.add_argument("-l", "--lam", type=float, 53 | help="regularization parameter value, default: 1, 10000 (boosting)") 54 | parser.add_argument("-g", "--gamma", type=float, 55 | help="gamma parameter for the Q-Learning, default: 0.99") 56 | parser.add_argument("-s", "--buffer_size", type=int, 57 | help="Replay Buffer size, default: 1000000") 58 | parser.add_argument("-a", "--n_drl", type=int, 59 | help="number of drl updates before an srl update, default: 100000") 60 | parser.add_argument("-b", "--batch_size", type=int, 61 | help="number of samples in each batch, default: 64") 62 | parser.add_argument("-i", "--steps_to_start_learn", type=int, 63 | help="number of steps before the agents starts learning, default: 10000") 64 | parser.add_argument("-c", "--test_iter", type=int, 65 | help="number of iterations between policy testing, default: 10000") 66 | # for playing 67 | parser.add_argument("-x", "--record", help="Directory to store video recording") 68 | parser.add_argument("--no-visualize", default=True, action='store_false', dest='visualize', 69 | help="Disable visualization of the game play") 70 | 71 | args = parser.parse_args() 72 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 73 | if args.lsddpg: 74 | use_ls_ddpg = True 75 | else: 76 | use_ls_ddpg = False 77 | if args.boosting: 78 | use_boosting = True 79 | lam = 1000 80 | else: 81 | 
use_boosting = False 82 | lam = 1 83 | 84 | # Training 85 | if args.train: 86 | if args.name: 87 | model_name = args.name 88 | else: 89 | model_name = '' 90 | if args.decay_rate: 91 | decay_rate = args.decay_rate 92 | else: 93 | decay_rate = None 94 | if args.lr_actor: 95 | lr_actor = args.lr_actor 96 | else: 97 | lr_actor = 0.0001 98 | if args.lr_critic: 99 | lr_critic = args.lr_critic 100 | else: 101 | lr_critic = 0.0001 102 | if args.lam: 103 | lam = args.lam 104 | if args.gamma: 105 | gamma = args.gamma 106 | else: 107 | gamma = 0.99 108 | if args.buffer_size: 109 | replay_size = args.buffer_size 110 | else: 111 | replay_size = 100000 112 | if args.n_drl: 113 | n_drl = args.n_drl 114 | else: 115 | n_drl = 100000 # steps of DRL between SRL 116 | if args.batch_size: 117 | batch_size = args.batch_size 118 | else: 119 | batch_size = 64 120 | if args.steps_to_start_learn: 121 | steps_to_start_learn = args.steps_to_start_learn 122 | else: 123 | steps_to_start_learn = 10000 124 | if args.test_iter: 125 | test_iter = args.test_iter 126 | else: 127 | test_iter = 10000 128 | 129 | # training_random_seed = 2019 130 | save_freq = 50000 131 | n_srl = replay_size # size of batch in SRL step 132 | # use_constant_seed = False # to compare performance independently of the randomness 133 | 134 | model_saving_path = './agent_ckpt/agent_' + model_name + ".pth" 135 | # if use_constant_seed: 136 | # model_name += "-SEED-" + str(training_random_seed) 137 | # np.random.seed(training_random_seed) 138 | # random.seed(training_random_seed) 139 | # env.seed(training_random_seed) 140 | # torch.manual_seed(training_random_seed) 141 | # if torch.cuda.is_available(): 142 | # torch.cuda.manual_seed_all(training_random_seed) 143 | # print("training using constant seed of ", training_random_seed) 144 | env = gym.make(args.env) 145 | test_env = gym.make(args.env) 146 | name = model_name + "_agent_" + args.env 147 | if use_ls_ddpg: 148 | print("using LS-DDPG") 149 | name += "-LS-LAM-" + str(lam) + "-" + str(int(1.0 * n_drl / 1000)) + "K" 150 | if use_boosting: 151 | print("using boosting") 152 | name += "-BOOSTING" 153 | # if use_constant_seed: 154 | # name += "-SEED-" + str(training_random_seed) 155 | # np.random.seed(training_random_seed) 156 | # random.seed(training_random_seed) 157 | # env.seed(training_random_seed) 158 | # test_env.seed(training_random_seed) 159 | # torch.manual_seed(training_random_seed) 160 | # if torch.cuda.is_available(): 161 | # torch.cuda.manual_seed_all(training_random_seed) 162 | # print("training using constant seed of ", training_random_seed) 163 | name += "-BATCH-" + str(batch_size) 164 | save_path = os.path.join("saves", "ddpg-" + name) 165 | os.makedirs(save_path, exist_ok=True) 166 | ckpt_save_path = './agent_ckpt/' + name + ".pth" 167 | if not os.path.exists('./agent_ckpt/'): 168 | os.makedirs('./agent_ckpt') 169 | 170 | act_net = agent_model.DDPGActor(env.observation_space.shape[0], env.action_space.shape[0]).to(device) 171 | crt_net = agent_model.DDPGCritic(env.observation_space.shape[0], env.action_space.shape[0]).to(device) 172 | print(act_net) 173 | print(crt_net) 174 | tgt_act_net = agent_model.TargetNet(act_net) 175 | tgt_crt_net = agent_model.TargetNet(crt_net) 176 | 177 | writer = SummaryWriter(comment="-ddpg-" + name) 178 | if decay_rate is not None: 179 | agent = agent_model.AgentDDPG(act_net, device=device, ou_decay_steps=decay_rate) 180 | else: 181 | agent = agent_model.AgentDDPG(act_net, device=device) 182 | exp_source = Experience.ExperienceSourceFirstLast(env, agent, 
gamma=gamma, steps_count=1) 183 | buffer = Experience.ExperienceReplayBuffer(exp_source, buffer_size=replay_size) 184 | if args.optimizer and args.optimizer == "RMSprop": 185 | act_opt = optim.RMSprop(act_net.parameters(), lr=lr_actor) 186 | crt_opt = optim.RMSprop(crt_net.parameters(), lr=lr_critic) 187 | else: 188 | act_opt = optim.Adam(act_net.parameters(), lr=lr_actor) 189 | crt_opt = optim.Adam(crt_net.parameters(), lr=lr_critic) 190 | 191 | utils.load_agent_state(act_net, crt_net, [act_opt, crt_opt], path=ckpt_save_path) 192 | 193 | frame_idx = 0 194 | drl_updates = 0 195 | best_reward = None 196 | with utils.RewardTracker(writer) as tracker: 197 | with utils.TBMeanTracker(writer, batch_size=10) as tb_tracker: 198 | while True: 199 | frame_idx += 1 200 | buffer.populate(1) 201 | rewards_steps = exp_source.pop_rewards_steps() 202 | if rewards_steps: 203 | rewards, steps = zip(*rewards_steps) 204 | tb_tracker.track("episode_steps", steps[0], frame_idx) 205 | mean_reward = tracker.reward(rewards[0], frame_idx) 206 | if mean_reward is not None and mean_reward > REWARD_TO_SOLVE: 207 | print("environment solved in % steps" % frame_idx, 208 | " (% episodes)" % len(tracker.total_rewards)) 209 | utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt], frame_idx, 210 | len(tracker.total_rewards), path=ckpt_save_path) 211 | break 212 | 213 | if len(buffer) < steps_to_start_learn: 214 | continue 215 | 216 | batch = buffer.sample(batch_size) 217 | states_v, actions_v, rewards_v, dones_mask, last_states_v = utils.unpack_batch(batch, device) 218 | 219 | # train critic 220 | crt_opt.zero_grad() 221 | q_v = crt_net(states_v, actions_v) 222 | last_act_v = tgt_act_net.target_model(last_states_v) 223 | q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v) 224 | q_last_v[dones_mask] = 0.0 225 | q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * gamma 226 | critic_loss_v = F.mse_loss(q_v, q_ref_v.detach()) 227 | critic_loss_v.backward() 228 | crt_opt.step() 229 | tb_tracker.track("loss_critic", critic_loss_v, frame_idx) 230 | tb_tracker.track("critic_ref", q_ref_v.mean(), frame_idx) 231 | 232 | drl_updates += 1 233 | # LS-UPDATE STEP for Critic (Q) 234 | if use_ls_ddpg and (drl_updates % n_drl == 0) and (len(buffer) >= n_srl): 235 | # if len(buffer) > 1: 236 | print("performing ls step...") 237 | batch = buffer.sample(n_srl) 238 | ls_step([act_net, crt_net], [tgt_act_net, tgt_crt_net], batch, gamma, len(buffer), 239 | lam=lam, m_batch_size=256, device=device, use_boosting=use_boosting) 240 | 241 | # train actor 242 | act_opt.zero_grad() 243 | cur_actions_v = act_net(states_v) 244 | actor_loss_v = -crt_net(states_v, cur_actions_v) 245 | actor_loss_v = actor_loss_v.mean() 246 | actor_loss_v.backward() 247 | act_opt.step() 248 | tb_tracker.track("loss_actor", actor_loss_v, frame_idx) 249 | 250 | tgt_act_net.alpha_sync(alpha=1 - 1e-3) 251 | tgt_crt_net.alpha_sync(alpha=1 - 1e-3) 252 | 253 | if frame_idx % test_iter == 0: 254 | ts = time.time() 255 | rewards, steps = test_net(act_net, test_env, agent_model, device=device) 256 | print("Test done in %.2f sec, reward %.3f, steps %d" % ( 257 | time.time() - ts, rewards, steps)) 258 | writer.add_scalar("test_reward", rewards, frame_idx) 259 | writer.add_scalar("test_steps", steps, frame_idx) 260 | if best_reward is None or best_reward < rewards: 261 | if best_reward is not None: 262 | print("Best reward updated: %.3f -> %.3f" % (best_reward, rewards)) 263 | name = "best_%+.3f_%d.dat" % (rewards, frame_idx) 264 | fname = os.path.join(save_path, name) 
265 | torch.save(act_net.state_dict(), fname) 266 | utils.save_agent_state(act_net, crt_net, [act_opt, crt_opt], frame_idx, 267 | len(tracker.total_rewards), path=ckpt_save_path) 268 | best_reward = rewards 269 | 270 | pass 271 | elif args.play: 272 | # play 273 | if args.path: 274 | path_to_model_ckpt = args.path 275 | else: 276 | raise SystemExit("must include path to agent checkpoint") 277 | render = True 278 | spec = gym.envs.registry.spec(args.env) 279 | if spec._kwargs.get('render') and render: 280 | spec._kwargs['render'] = True 281 | env = gym.make(args.env) 282 | use_constant_seed = True 283 | seed = 2019 284 | if use_constant_seed: 285 | np.random.seed(seed) 286 | random.seed(seed) 287 | env.seed(seed) 288 | torch.manual_seed(seed) 289 | if torch.cuda.is_available(): 290 | torch.cuda.manual_seed_all(seed) 291 | print("seed set to ", seed) 292 | if args.record: 293 | # pass 294 | env = gym.wrappers.Monitor(env, args.record) 295 | 296 | net = agent_model.DDPGActor(env.observation_space.shape[0], env.action_space.shape[0]) 297 | net.load_state_dict(torch.load(path_to_model_ckpt)) 298 | 299 | obs = env.reset() 300 | total_reward = 0.0 301 | total_steps = 0 302 | while True: 303 | obs_v = torch.FloatTensor([obs]) 304 | mu_v = net(obs_v) 305 | action = mu_v.squeeze(dim=0).data.numpy() 306 | action = np.clip(action, -1, 1) 307 | obs, reward, done, _ = env.step(action) 308 | total_reward += reward 309 | total_steps += 1 310 | if render: 311 | env.render() 312 | if done: 313 | env.close() 314 | break 315 | print("In %d steps we got %.3f reward" % (total_steps, total_reward)) 316 | else: 317 | raise SystemExit("must choose between train or play") 318 | 319 | --------------------------------------------------------------------------------