├── rl_utils
    ├── __init__.py
    ├── logger
    │   ├── __init__.py
    │   ├── plot.py
    │   ├── bench.py
    │   └── logger.py
    ├── mpi_utils
    │   ├── __init__.py
    │   ├── utils.py
    │   └── normalizer.py
    ├── running_filter
    │   ├── __init__.py
    │   └── running_filter.py
    ├── seeds
    │   └── seeds.py
    ├── env_wrapper
    │   ├── frame_stack.py
    │   ├── create_env.py
    │   ├── multi_envs_wrapper.py
    │   ├── __init__.py
    │   └── atari_wrapper.py
    └── experience_replay
        └── experience_replay.py
├── figures
    ├── logo.png
    ├── 01_dqn.png
    ├── 03_a2c.png
    ├── 05_ppo.png
    ├── 06_sac.png
    ├── hopper.gif
    ├── 02_ddpg.png
    ├── 04_trpo.png
    ├── bipedal.gif
    └── breakout.gif
├── rl_algorithms
    ├── trpo
    │   ├── README.md
    │   ├── train.py
    │   ├── demo.py
    │   ├── models.py
    │   ├── arguments.py
    │   ├── utils.py
    │   └── trpo_agent.py
    ├── a2c
    │   ├── README.md
    │   ├── train.py
    │   ├── utils.py
    │   ├── demo.py
    │   ├── arguments.py
    │   ├── models.py
    │   └── a2c_agent.py
    ├── sac
    │   ├── README.md
    │   ├── train.py
    │   ├── demo.py
    │   ├── models.py
    │   ├── utils.py
    │   ├── arguments.py
    │   └── sac_agent.py
    ├── ddpg
    │   ├── README.md
    │   ├── utils.py
    │   ├── train.py
    │   ├── models.py
    │   ├── demo.py
    │   ├── arguments.py
    │   └── ddpg_agent.py
    ├── dqn_algos
    │   ├── README.md
    │   ├── train.py
    │   ├── demo.py
    │   ├── utils.py
    │   ├── arguments.py
    │   ├── models.py
    │   └── dqn_agent.py
    └── ppo
        ├── README.md
        ├── train.py
        ├── utils.py
        ├── arguments.py
        ├── demo.py
        ├── models.py
        └── ppo_agent.py
├── setup.py
├── .gitignore
└── README.md
/rl_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_utils/logger/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_utils/mpi_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_utils/running_filter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/logo.png -------------------------------------------------------------------------------- /figures/01_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/01_dqn.png -------------------------------------------------------------------------------- /figures/03_a2c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/03_a2c.png -------------------------------------------------------------------------------- /figures/05_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/05_ppo.png -------------------------------------------------------------------------------- /figures/06_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/06_sac.png
-------------------------------------------------------------------------------- /figures/hopper.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/hopper.gif -------------------------------------------------------------------------------- /figures/02_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/02_ddpg.png -------------------------------------------------------------------------------- /figures/04_trpo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/04_trpo.png -------------------------------------------------------------------------------- /figures/bipedal.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/bipedal.gif -------------------------------------------------------------------------------- /figures/breakout.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/breakout.gif -------------------------------------------------------------------------------- /rl_algorithms/trpo/README.md: -------------------------------------------------------------------------------- 1 | # Trust Region Policy Optimization (TRPO) 2 | ## Instructions 3 | 1. Train the agents (GPU is not supported): 4 | ```bash 5 | python train.py --env-name='' 6 | ``` 7 | 2. Play the demo: 8 | ```bash 9 | python demo.py --env-name='' 10 | ``` 11 | ## Results 12 | ![](../../figures/04_trpo.png) 13 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/README.md: -------------------------------------------------------------------------------- 1 | # Synchronous Advantage Actor-Critic (A2C) 2 | ## Instructions 3 | 1. Train the agents: 4 | ```bash 5 | python train.py --env-name='' --cuda (if you have a GPU) 6 | ``` 7 | 2. Play the demo: 8 | ```bash 9 | python demo.py --env-name='' 10 | ``` 11 | ## Results 12 | ![](../../figures/03_a2c.png) 13 | -------------------------------------------------------------------------------- /rl_algorithms/sac/README.md: -------------------------------------------------------------------------------- 1 | # Soft Actor-Critic (SAC) 2 | ## Instructions 3 | 1. Train the agents: 4 | ```bash 5 | python train.py --env-name='' --cuda (if you have a GPU) -- 6 | ``` 7 | 2. 
Play the demo: 8 | ```bash 9 | python demo.py --env-name='' 10 | ``` 11 | ## Results 12 | ![](../../figures/06_sac.png) 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | """ 4 | install the packages 5 | 6 | """ 7 | setup(name='rl_utils', 8 | version='0.0', 9 | description='rl utils for the rl algorithms', 10 | author='Tianhong Dai', 11 | author_email='xxx@xxx.com', 12 | url='no', 13 | packages=['rl_utils'], 14 | ) 15 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/README.md: -------------------------------------------------------------------------------- 1 | # Deep Deterministic Policy Gradient (DDPG) 2 | ## Instructions 3 | 1. Train the agents (GPU is not supported, will support it in the future): 4 | ```bash 5 | mpirun -np 1 python -u train.py --env-name='' -- 2>&1 | tee exp_ddpg.log 6 | ``` 7 | 2. Play the demo: 8 | ```bash 9 | python demo.py --env-name='' 10 | ``` 11 | ## Results 12 | ![](../../figures/02_ddpg.png) 13 | -------------------------------------------------------------------------------- /rl_utils/seeds/seeds.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import torch 4 | 5 | # set random seeds for the pytorch, numpy and random 6 | def set_seeds(args, rank=0): 7 | # set seeds for the numpy 8 | np.random.seed(args.seed + rank) 9 | # set seeds for the random.random 10 | random.seed(args.seed + rank) 11 | # set seeds for the pytorch 12 | torch.manual_seed(args.seed + rank) 13 | if args.cuda: 14 | torch.cuda.manual_seed(args.seed + rank) 15 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/README.md: -------------------------------------------------------------------------------- 1 | # Deep Q Networks (DQN) 2 | ## Instructions 3 | 1. Train the agents, can use flag `--use-dueling` and `--use-double-net` to try the Double DQN or Dueling Network Architecture: 4 | ```bash 5 | python train.py --env-name='' --cuda (if you have a GPU) -- 6 | ``` 7 | 2. 
Play the demo - Please use the same algorithm flag as training: 8 | ```bash 9 | python demo.py --env-name='' -- 10 | ``` 11 | ## Results 12 | ![](../../figures/01_dqn.png) 13 | -------------------------------------------------------------------------------- /rl_algorithms/sac/train.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from sac_agent import sac_agent 3 | from rl_utils.seeds.seeds import set_seeds 4 | from rl_utils.env_wrapper.create_env import create_single_env 5 | 6 | if __name__ == '__main__': 7 | args = get_args() 8 | # build the environment 9 | env = create_single_env(args) 10 | # set the seeds 11 | set_seeds(args) 12 | # create the agent 13 | sac_trainer = sac_agent(env, args) 14 | sac_trainer.learn() 15 | # close the environment 16 | env.close() 17 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/train.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from rl_utils.seeds.seeds import set_seeds 3 | from rl_utils.env_wrapper.create_env import create_single_env 4 | from trpo_agent import trpo_agent 5 | 6 | if __name__ == '__main__': 7 | args = get_args() 8 | # make environemnts 9 | env = create_single_env(args) 10 | # set the random seeds 11 | set_seeds(args) 12 | # create trpo trainer 13 | trpo_trainer = trpo_agent(env, args) 14 | trpo_trainer.learn() 15 | # close the environment 16 | env.close() 17 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from arguments import get_args 3 | from rl_utils.env_wrapper.create_env import create_single_env 4 | from rl_utils.logger import logger, bench 5 | from rl_utils.seeds.seeds import set_seeds 6 | from dqn_agent import dqn_agent 7 | import os 8 | import numpy as np 9 | 10 | if __name__ == '__main__': 11 | # get arguments 12 | args = get_args() 13 | # start to create the environment 14 | env = create_single_env(args) 15 | # set seeds 16 | set_seeds(args) 17 | # create trainer 18 | dqn_trainer = dqn_agent(env, args) 19 | # start to learn 20 | dqn_trainer.learn() 21 | # finally - close the environment 22 | env.close() 23 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/train.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from a2c_agent import a2c_agent 3 | from rl_utils.env_wrapper.create_env import create_multiple_envs 4 | from rl_utils.seeds.seeds import set_seeds 5 | from a2c_agent import a2c_agent 6 | import os 7 | 8 | if __name__ == '__main__': 9 | # set signle thread 10 | os.environ['OMP_NUM_THREADS'] = '1' 11 | os.environ['MKL_NUM_THREADS'] = '1' 12 | # get args 13 | args = get_args() 14 | # create environments 15 | envs = create_multiple_envs(args) 16 | # set seeds 17 | set_seeds(args) 18 | # create trainer 19 | a2c_trainer = a2c_agent(envs, args) 20 | a2c_trainer.learn() 21 | # close the environment 22 | envs.close() 23 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | # add ounoise here 5 | class ounoise(): 6 | def __init__(self, std, action_dim, 
mean=0, theta=0.15, dt=1e-2, x0=None): 7 | self.std = std 8 | self.mean = mean 9 | self.action_dim = action_dim 10 | self.theta = theta 11 | self.dt = dt 12 | self.x0 = x0 13 | 14 | # reset the noise 15 | def reset(self): 16 | self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.action_dim) 17 | 18 | # generate noise 19 | def noise(self): 20 | x = self.x_prev + self.theta * (self.mean - self.x_prev) * self.dt + \ 21 | self.std * np.sqrt(self.dt) * np.random.normal(size=self.action_dim) 22 | self.x_prev = x 23 | return x 24 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/train.py: -------------------------------------------------------------------------------- 1 | from ddpg_agent import ddpg_agent 2 | from arguments import get_args 3 | from rl_utils.seeds.seeds import set_seeds 4 | from rl_utils.env_wrapper.create_env import create_single_env 5 | from mpi4py import MPI 6 | import os 7 | 8 | if __name__ == '__main__': 9 | # set thread and mpi stuff 10 | os.environ['OMP_NUM_THREADS'] = '1' 11 | os.environ['MKL_NUM_THREADS'] = '1' 12 | os.environ['IN_MPI'] = '1' 13 | # train the network 14 | args = get_args() 15 | # build up the environment 16 | env = create_single_env(args, MPI.COMM_WORLD.Get_rank()) 17 | # set the random seeds 18 | set_seeds(args, MPI.COMM_WORLD.Get_rank()) 19 | # start traininng 20 | ddpg_trainer = ddpg_agent(env, args) 21 | ddpg_trainer.learn() 22 | # close the environment 23 | env.close() 24 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/README.md: -------------------------------------------------------------------------------- 1 | # Proximal Policy Optimization (PPO) 2 | ## Instructions 3 | 1. Train the agents - **Atari Env**: 4 | ```bash 5 | python train.py --env-name='' --cuda (if you have a GPU) --env-type='atari' --lr-decay 6 | ``` 7 | 2. Train the agents - **Mujoco Env** (we also support beta distribution, can use `--dist` flag): 8 | ```bash 9 | python train.py --env-name='' --cuda (if you have a GPU) --env-type='mujoco' --num-workers=1 --nsteps=2048 --clip=0.2 --batch-size=32 --epoch=10 --lr=3e-4 --ent-coef=0 --total-frames=1000000 --vloss-coef=1 10 | ``` 11 | 3. Play the demo - Please use the same `--env-type` and `--dist` flag used in the training. 12 | ```bash 13 | python demo.py --env-name='' --env-type='' --dist='' 14 | ``` 15 | ## Results 16 | ![](../../figures/05_ppo.png) 17 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.distributions.categorical import Categorical 4 | 5 | # select - actions 6 | def select_actions(pi, deterministic=False): 7 | cate_dist = Categorical(pi) 8 | if deterministic: 9 | return torch.argmax(pi, dim=1).item() 10 | else: 11 | return cate_dist.sample().unsqueeze(-1) 12 | 13 | # get the action log prob and entropy... 
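# pi: softmax policy output with shape [batch, n_actions]; actions: sampled action indices with shape [batch, 1] (as produced by select_actions above)
# returns the per-sample log probability ([batch, 1]) and the mean entropy of the categorical distribution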
14 | def evaluate_actions(pi, actions): 15 | cate_dist = Categorical(pi) 16 | return cate_dist.log_prob(actions.squeeze(-1)).unsqueeze(-1), cate_dist.entropy().mean() 17 | 18 | def discount_with_dones(rewards, dones, gamma): 19 | discounted = [] 20 | r = 0 21 | for reward, done in zip(rewards[::-1], dones[::-1]): 22 | r = reward + gamma * r * (1.-done) 23 | discounted.append(r) 24 | return discounted[::-1] 25 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/train.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from ppo_agent import ppo_agent 3 | from rl_utils.env_wrapper.create_env import create_multiple_envs, create_single_env 4 | from rl_utils.seeds.seeds import set_seeds 5 | import os 6 | 7 | if __name__ == '__main__': 8 | # set signle thread 9 | os.environ['OMP_NUM_THREADS'] = '1' 10 | os.environ['MKL_NUM_THREADS'] = '1' 11 | # get arguments 12 | args = get_args() 13 | # start to create the environment 14 | if args.env_type == 'atari': 15 | envs = create_multiple_envs(args) 16 | elif args.env_type == 'mujoco': 17 | envs = create_single_env(args) 18 | else: 19 | raise NotImplementedError 20 | # create trainer 21 | ppo_trainer = ppo_agent(envs, args) 22 | # start to learn 23 | ppo_trainer.learn() 24 | # close the environment 25 | envs.close() 26 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # define the actor network 6 | class actor(nn.Module): 7 | def __init__(self, obs_dims, action_dims): 8 | super(actor, self).__init__() 9 | self.fc1 = nn.Linear(obs_dims, 400) 10 | self.fc2 = nn.Linear(400, 300) 11 | self.action_out = nn.Linear(300, action_dims) 12 | 13 | def forward(self, x): 14 | x = F.relu(self.fc1(x)) 15 | x = F.relu(self.fc2(x)) 16 | actions = torch.tanh(self.action_out(x)) 17 | return actions 18 | 19 | class critic(nn.Module): 20 | def __init__(self, obs_dims, action_dims): 21 | super(critic, self).__init__() 22 | self.fc1 = nn.Linear(obs_dims, 400) 23 | self.fc2 = nn.Linear(400 + action_dims, 300) 24 | self.q_out = nn.Linear(300, 1) 25 | 26 | def forward(self, x, actions): 27 | x = F.relu(self.fc1(x)) 28 | x = torch.cat([x, actions], dim=1) 29 | x = F.relu(self.fc2(x)) 30 | q_value = self.q_out(x) 31 | return q_value 32 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/demo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from arguments import get_args 3 | from models import net 4 | import torch 5 | from rl_utils.env_wrapper.atari_wrapper import make_atari, wrap_deepmind 6 | 7 | def get_tensors(obs): 8 | obs = np.transpose(obs, (2, 0, 1)) 9 | obs = np.expand_dims(obs, 0) 10 | obs = torch.tensor(obs, dtype=torch.float32) 11 | return obs 12 | 13 | if __name__ == '__main__': 14 | args = get_args() 15 | # create the environment 16 | env = make_atari(args.env_name) 17 | env = wrap_deepmind(env, frame_stack=True) 18 | # create the network 19 | net = net(env.action_space.n, args.use_dueling) 20 | # model path 21 | model_path = args.save_dir + args.env_name + '/model.pt' 22 | # load the models 23 | net.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage)) 24 | # start to test the 
demo 25 | obs = env.reset() 26 | for _ in range(2000): 27 | env.render() 28 | with torch.no_grad(): 29 | obs_tensor = get_tensors(obs) 30 | action_value = net(obs_tensor) 31 | action = torch.argmax(action_value.squeeze()).item() 32 | obs, reward, done, _ = env.step(action) 33 | if done: 34 | obs = env.reset() 35 | env.close() 36 | -------------------------------------------------------------------------------- /rl_utils/env_wrapper/frame_stack.py: -------------------------------------------------------------------------------- 1 | from rl_utils.env_wrapper import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | 6 | class VecFrameStack(VecEnvWrapper): 7 | def __init__(self, venv, nstack): 8 | self.venv = venv 9 | self.nstack = nstack 10 | wos = venv.observation_space # wrapped ob space 11 | low = np.repeat(wos.low, self.nstack, axis=-1) 12 | high = np.repeat(wos.high, self.nstack, axis=-1) 13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype) 14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 16 | 17 | def step_wait(self): 18 | obs, rews, news, infos = self.venv.step_wait() 19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 20 | for (i, new) in enumerate(news): 21 | if new: 22 | self.stackedobs[i] = 0 23 | self.stackedobs[..., -obs.shape[-1]:] = obs 24 | return self.stackedobs, rews, news, infos 25 | 26 | def reset(self): 27 | obs = self.venv.reset() 28 | self.stackedobs[...] = 0 29 | self.stackedobs[..., -obs.shape[-1]:] = obs 30 | return self.stackedobs 31 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/demo.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from models import net 3 | import torch 4 | from utils import select_actions 5 | import cv2 6 | import numpy as np 7 | from rl_utils.env_wrapper.frame_stack import VecFrameStack 8 | from rl_utils.env_wrapper.atari_wrapper import make_atari, wrap_deepmind 9 | 10 | # update the current observation 11 | def get_tensors(obs): 12 | input_tensor = torch.tensor(np.transpose(obs, (2, 0, 1)), dtype=torch.float32).unsqueeze(0) 13 | return input_tensor 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | # create environment 18 | #env = VecFrameStack(wrap_deepmind(make_atari(args.env_name)), 4) 19 | env = make_atari(args.env_name) 20 | env = wrap_deepmind(env, frame_stack=True) 21 | # get the model path 22 | model_path = args.save_dir + args.env_name + '/model.pt' 23 | network = net(env.action_space.n) 24 | network.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage)) 25 | obs = env.reset() 26 | while True: 27 | env.render() 28 | # get the obs 29 | with torch.no_grad(): 30 | input_tensor = get_tensors(obs) 31 | _, pi = network(input_tensor) 32 | actions = select_actions(pi, True) 33 | obs, reward, done, _ = env.step([actions]) 34 | env.close() 35 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.distributions.normal import Normal 4 | from torch.distributions.beta import Beta 5 | from torch.distributions.categorical import Categorical 6 | import random 7 | 8 | def select_actions(pi, dist_type, env_type): 9 | if 
env_type == 'atari': 10 | actions = Categorical(pi).sample() 11 | else: 12 | if dist_type == 'gauss': 13 | mean, std = pi 14 | actions = Normal(mean, std).sample() 15 | elif dist_type == 'beta': 16 | alpha, beta = pi 17 | actions = Beta(alpha.detach().cpu(), beta.detach().cpu()).sample() 18 | # return actions 19 | return actions.detach().cpu().numpy().squeeze() 20 | 21 | def evaluate_actions(pi, actions, dist_type, env_type): 22 | if env_type == 'atari': 23 | cate_dist = Categorical(pi) 24 | log_prob = cate_dist.log_prob(actions).unsqueeze(-1) 25 | entropy = cate_dist.entropy().mean() 26 | else: 27 | if dist_type == 'gauss': 28 | mean, std = pi 29 | normal_dist = Normal(mean, std) 30 | log_prob = normal_dist.log_prob(actions).sum(dim=1, keepdim=True) 31 | entropy = normal_dist.entropy().mean() 32 | elif dist_type == 'beta': 33 | alpha, beta = pi 34 | beta_dist = Beta(alpha, beta) 35 | log_prob = beta_dist.log_prob(actions).sum(dim=1, keepdim=True) 36 | entropy = beta_dist.entropy().mean() 37 | return log_prob, entropy 38 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/demo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | from arguments import get_args 5 | from models import network 6 | 7 | def denormalize(x, mean, std, clip=10): 8 | x -= mean 9 | x /= (std + 1e-8) 10 | return np.clip(x, -clip, clip) 11 | 12 | def get_tensors(x): 13 | return torch.tensor(x, dtype=torch.float32).unsqueeze(0) 14 | 15 | if __name__ == '__main__': 16 | args = get_args() 17 | # create the environment 18 | env = gym.make(args.env_name) 19 | # build up the network 20 | net = network(env.observation_space.shape[0], env.action_space.shape[0]) 21 | # load the saved model 22 | model_path = args.save_dir + args.env_name + '/model.pt' 23 | network_model, filters = torch.load(model_path, map_location=lambda storage, loc: storage) 24 | net.load_state_dict(network_model) 25 | net.eval() 26 | for _ in range(10): 27 | obs = denormalize(env.reset(), filters.rs.mean, filters.rs.std) 28 | reward_total = 0 29 | for _ in range(10000): 30 | env.render() 31 | obs_tensor = get_tensors(obs) 32 | with torch.no_grad(): 33 | _, (mean, _) = net(obs_tensor) 34 | action = mean.numpy().squeeze() 35 | obs, reward, done, _ = env.step(action) 36 | reward_total += reward 37 | obs = denormalize(obs, filters.rs.mean, filters.rs.std) 38 | if done: 39 | break 40 | print('the reward of this episode is: {}'.format(reward_total)) 41 | env.close() 42 | -------------------------------------------------------------------------------- /rl_utils/experience_replay/experience_replay.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | """ 5 | define the replay buffer and corresponding algorithms like PER 6 | 7 | """ 8 | 9 | class replay_buffer: 10 | def __init__(self, memory_size): 11 | self.storge = [] 12 | self.memory_size = memory_size 13 | self.next_idx = 0 14 | 15 | # add the samples 16 | def add(self, obs, action, reward, obs_, done): 17 | data = (obs, action, reward, obs_, done) 18 | if self.next_idx >= len(self.storge): 19 | self.storge.append(data) 20 | else: 21 | self.storge[self.next_idx] = data 22 | # get the next idx 23 | self.next_idx = (self.next_idx + 1) % self.memory_size 24 | 25 | # encode samples 26 | def _encode_sample(self, idx): 27 | obses, actions, rewards, obses_, dones = [], [], [], [], [] 28 | for i in 
idx: 29 | data = self.storge[i] 30 | obs, action, reward, obs_, done = data 31 | obses.append(np.array(obs, copy=False)) 32 | actions.append(np.array(action, copy=False)) 33 | rewards.append(reward) 34 | obses_.append(np.array(obs_, copy=False)) 35 | dones.append(done) 36 | return np.array(obses), np.array(actions), np.array(rewards), np.array(obses_), np.array(dones) 37 | 38 | # sample from the memory 39 | def sample(self, batch_size): 40 | idxes = [random.randint(0, len(self.storge) - 1) for _ in range(batch_size)] 41 | return self._encode_sample(idxes) 42 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | class network(nn.Module): 6 | def __init__(self, num_states, num_actions): 7 | super(network, self).__init__() 8 | # define the critic 9 | self.critic = critic(num_states) 10 | self.actor = actor(num_states, num_actions) 11 | 12 | def forward(self, x): 13 | state_value = self.critic(x) 14 | pi = self.actor(x) 15 | return state_value, pi 16 | 17 | class critic(nn.Module): 18 | def __init__(self, num_states): 19 | super(critic, self).__init__() 20 | self.fc1 = nn.Linear(num_states, 64) 21 | self.fc2 = nn.Linear(64, 64) 22 | self.value = nn.Linear(64, 1) 23 | 24 | def forward(self, x): 25 | x = F.tanh(self.fc1(x)) 26 | x = F.tanh(self.fc2(x)) 27 | value = self.value(x) 28 | return value 29 | 30 | class actor(nn.Module): 31 | def __init__(self, num_states, num_actions): 32 | super(actor, self).__init__() 33 | self.fc1 = nn.Linear(num_states, 64) 34 | self.fc2 = nn.Linear(64, 64) 35 | self.action_mean = nn.Linear(64, num_actions) 36 | self.sigma_log = nn.Parameter(torch.zeros(1, num_actions)) 37 | 38 | def forward(self, x): 39 | x = F.tanh(self.fc1(x)) 40 | x = F.tanh(self.fc2(x)) 41 | mean = self.action_mean(x) 42 | sigma_log = self.sigma_log.expand_as(mean) 43 | sigma = torch.exp(sigma_log) 44 | pi = (mean, sigma) 45 | 46 | return pi 47 | -------------------------------------------------------------------------------- /rl_algorithms/sac/demo.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | import gym 3 | import torch 4 | import numpy as np 5 | from models import tanh_gaussian_actor 6 | 7 | if __name__ == '__main__': 8 | args = get_args() 9 | env = gym.make(args.env_name) 10 | # get environment infos 11 | obs_dims = env.observation_space.shape[0] 12 | action_dims = env.action_space.shape[0] 13 | action_max = env.action_space.high[0] 14 | # define the network 15 | actor_net = tanh_gaussian_actor(obs_dims, action_dims, args.hidden_size, args.log_std_min, args.log_std_max) 16 | # load models 17 | model_path = args.save_dir + args.env_name + '/model.pt' 18 | # load the network weights 19 | actor_net.load_state_dict(torch.load(model_path, map_location='cpu')) 20 | for ep in range(5): 21 | obs = env.reset() 22 | reward_sum = 0 23 | # set the maximum timesteps here... 
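# each demo episode runs for at most 1000 environment steps and stops early once the environment returns done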
24 | for _ in range(1000): 25 | env.render() 26 | with torch.no_grad(): 27 | obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0) 28 | mean, std = actor_net(obs_tensor) 29 | actions = torch.tanh(mean).detach().numpy().squeeze() 30 | if action_dims == 1: 31 | actions = np.array([actions]) 32 | obs_, reward, done, _ = env.step(action_max * actions) 33 | reward_sum += reward 34 | if done: 35 | break 36 | obs = obs_ 37 | print('the episode is: {}, the reward is: {}'.format(ep, reward_sum)) 38 | env.close() 39 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(): 4 | parse = argparse.ArgumentParser() 5 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor of the RL') 6 | parse.add_argument('--env-name', type=str, default='Walker2d-v2', help='the training environment') 7 | parse.add_argument('--seed', type=int, default=123, help='the random seed') 8 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the folder to save models') 9 | parse.add_argument('--total-timesteps', type=int, default=int(1e6), help='the total frames') 10 | parse.add_argument('--nsteps', type=int, default=1024, help='the steps to collect samples') 11 | parse.add_argument('--lr', type=float, default=3e-4) 12 | parse.add_argument('--batch-size', type=int, default=64, help='the mini batch size ot update the value function') 13 | parse.add_argument('--vf-itrs', type=int, default=5, help='the times to update the value network') 14 | parse.add_argument('--tau', type=float, default=0.95, help='the param to calculate the gae') 15 | parse.add_argument('--damping', type=float, default=0.1, help='the damping coeffificent') 16 | parse.add_argument('--max-kl', type=float, default=0.01, help='the max kl divergence') 17 | parse.add_argument('--cuda', action='store_true', help='if use gpu') 18 | parse.add_argument('--env-type', type=str, default='mujoco', help='the environment type') 19 | parse.add_argument('--log-dir', type=str, default='logs', help='folder to save log files') 20 | 21 | args = parse.parse_args() 22 | 23 | return args 24 | -------------------------------------------------------------------------------- /rl_utils/mpi_utils/utils.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | import torch 4 | 5 | # sync_networks across the different cores 6 | def sync_networks(network): 7 | """ 8 | netowrk is the network you want to sync 9 | 10 | """ 11 | comm = MPI.COMM_WORLD 12 | flat_params = _get_flat_params_or_grads(network, mode='params') 13 | comm.Bcast(flat_params, root=0) 14 | # set the flat params back to the network 15 | _set_flat_params_or_grads(network, flat_params, mode='params') 16 | 17 | def sync_grads(network): 18 | flat_grads = _get_flat_params_or_grads(network, mode='grads') 19 | comm = MPI.COMM_WORLD 20 | global_grads = np.zeros_like(flat_grads) 21 | comm.Allreduce(flat_grads, global_grads, op=MPI.SUM) 22 | _set_flat_params_or_grads(network, global_grads, mode='grads') 23 | 24 | # get the flat grads or params 25 | def _get_flat_params_or_grads(network, mode='params'): 26 | """ 27 | include two kinds: grads and params 28 | 29 | """ 30 | attr = 'data' if mode == 'params' else 'grad' 31 | return np.concatenate([getattr(param, attr).cpu().numpy().flatten() for param in 
network.parameters()]) 32 | 33 | def _set_flat_params_or_grads(network, flat_params, mode='params'): 34 | """ 35 | include two kinds: grads and params 36 | 37 | """ 38 | attr = 'data' if mode == 'params' else 'grad' 39 | # the pointer 40 | pointer = 0 41 | for param in network.parameters(): 42 | getattr(param, attr).copy_(torch.tensor(flat_params[pointer:pointer + param.data.numel()]).view_as(param.data)) 43 | pointer += param.data.numel() 44 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/demo.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | import gym 3 | from models import actor 4 | import torch 5 | import numpy as np 6 | 7 | def normalize(obs, mean, std, clip): 8 | return np.clip((obs - mean) / std, -clip, clip) 9 | 10 | if __name__ == '__main__': 11 | args = get_args() 12 | env = gym.make(args.env_name) 13 | # get environment infos 14 | obs_dims = env.observation_space.shape[0] 15 | action_dims = env.action_space.shape[0] 16 | action_max = env.action_space.high[0] 17 | # define the network 18 | actor_net = actor(obs_dims, action_dims) 19 | # load models 20 | model_path = args.save_dir + args.env_name + '/model.pt' 21 | model, mean, std = torch.load(model_path, map_location=lambda storage, loc: storage) 22 | # load models into the network 23 | actor_net.load_state_dict(model) 24 | for ep in range(10): 25 | obs = env.reset() 26 | reward_sum = 0 27 | while True: 28 | env.render() 29 | with torch.no_grad(): 30 | norm_obs = normalize(obs, mean, std, args.clip_range) 31 | norm_obs_tensor = torch.tensor(norm_obs, dtype=torch.float32).unsqueeze(0) 32 | actions = actor_net(norm_obs_tensor) 33 | actions = actions.detach().numpy().squeeze() 34 | if action_dims == 1: 35 | actions = np.array([actions]) 36 | obs_, reward, done, _ = env.step(action_max * actions) 37 | reward_sum += reward 38 | if done: 39 | break 40 | obs = obs_ 41 | print('the episode is: {}, the reward is: {}'.format(ep, reward_sum)) 42 | env.close() 43 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | # linear exploration schedule 5 | class linear_schedule: 6 | def __init__(self, total_timesteps, final_ratio, init_ratio=1.0): 7 | self.total_timesteps = total_timesteps 8 | self.final_ratio = final_ratio 9 | self.init_ratio = init_ratio 10 | 11 | def get_value(self, timestep): 12 | frac = min(float(timestep) / self.total_timesteps, 1.0) 13 | return self.init_ratio - frac * (self.init_ratio - self.final_ratio) 14 | 15 | # select actions 16 | def select_actions(action_value, explore_eps): 17 | action_value = action_value.cpu().numpy().squeeze() 18 | # select actions 19 | action = np.argmax(action_value) if random.random() > explore_eps else np.random.randint(action_value.shape[0]) 20 | return action 21 | 22 | # record the reward info of the dqn experiments 23 | class reward_recorder: 24 | def __init__(self, history_length=100): 25 | self.history_length = history_length 26 | # the empty buffer to store rewards 27 | self.buffer = [0.0] 28 | self._episode_length = 1 29 | 30 | # add rewards 31 | def add_rewards(self, reward): 32 | self.buffer[-1] += reward 33 | 34 | # start new episode 35 | def start_new_episode(self): 36 | if self.get_length >= self.history_length: 37 | self.buffer.pop(0) 38 | # append new one 39 | 
self.buffer.append(0.0) 40 | self._episode_length += 1 41 | 42 | # get length of buffer 43 | @property 44 | def get_length(self): 45 | return len(self.buffer) 46 | 47 | @property 48 | def mean(self): 49 | return np.mean(self.buffer) 50 | 51 | # get the length of total episodes 52 | @property 53 | def num_episodes(self): 54 | return self._episode_length 55 | -------------------------------------------------------------------------------- /rl_algorithms/sac/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # the flatten mlp 6 | class flatten_mlp(nn.Module): 7 | #TODO: add the initialization method for it 8 | def __init__(self, input_dims, hidden_size, action_dims=None): 9 | super(flatten_mlp, self).__init__() 10 | self.fc1 = nn.Linear(input_dims, hidden_size) if action_dims is None else nn.Linear(input_dims + action_dims, hidden_size) 11 | self.fc2 = nn.Linear(hidden_size, hidden_size) 12 | self.q_value = nn.Linear(hidden_size, 1) 13 | 14 | def forward(self, obs, action=None): 15 | inputs = torch.cat([obs, action], dim=1) if action is not None else obs 16 | x = F.relu(self.fc1(inputs)) 17 | x = F.relu(self.fc2(x)) 18 | output = self.q_value(x) 19 | return output 20 | 21 | # define the policy network - tanh gaussian policy network 22 | # TODO: Not use the log std 23 | class tanh_gaussian_actor(nn.Module): 24 | def __init__(self, input_dims, action_dims, hidden_size, log_std_min, log_std_max): 25 | super(tanh_gaussian_actor, self).__init__() 26 | self.fc1 = nn.Linear(input_dims, hidden_size) 27 | self.fc2 = nn.Linear(hidden_size, hidden_size) 28 | self.mean = nn.Linear(hidden_size, action_dims) 29 | self.log_std = nn.Linear(hidden_size, action_dims) 30 | # the log_std_min and log_std_max 31 | self.log_std_min = log_std_min 32 | self.log_std_max = log_std_max 33 | 34 | def forward(self, obs): 35 | x = F.relu(self.fc1(obs)) 36 | x = F.relu(self.fc2(x)) 37 | mean = self.mean(x) 38 | log_std = self.log_std(x) 39 | # clamp the log std 40 | log_std = torch.clamp(log_std, min=self.log_std_min, max=self.log_std_max) 41 | # the reparameterization trick 42 | # return mean and std 43 | return (mean, torch.exp(log_std)) 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # DS Store 107 | .DS_Store 108 | 109 | #saved_model 110 | *.pth 111 | 112 | *.pt 113 | 114 | *.log 115 | 116 | *.txt 117 | *.csv 118 | logs/ 119 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(): 4 | parse = argparse.ArgumentParser() 5 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor of RL') 6 | parse.add_argument('--seed', type=int, default=123, help='the random seeds') 7 | parse.add_argument('--env-name', type=str, default='BreakoutNoFrameskip-v4', help='the environment name') 8 | parse.add_argument('--lr', type=float, default=7e-4, help='learning rate of the algorithm') 9 | parse.add_argument('--value-loss-coef', type=float, default=0.5, help='the coefficient of value loss') 10 | parse.add_argument('--tau', type=float, default=0.95, help='gae coefficient') 11 | parse.add_argument('--cuda', action='store_true', help='use cuda do the training') 12 | parse.add_argument('--total-frames', type=int, default=20000000, help='the total frames for training') 13 | parse.add_argument('--eps', type=float, default=1e-5, help='param for adam optimizer') 14 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the folder to save models') 15 | parse.add_argument('--nsteps', type=int, default=5, help='the steps to update the network') 16 | parse.add_argument('--num-workers', type=int, default=16, help='the number of cpu you use') 17 | parse.add_argument('--entropy-coef', type=float, default=0.01, help='entropy-reg') 18 | parse.add_argument('--log-interval', type=int, default=100, help='the log interval') 19 | parse.add_argument('--alpha', type=float, default=0.99, help='the alpha coe of RMSprop') 20 | parse.add_argument('--max-grad-norm', type=float, default=0.5, help='the grad clip') 21 | parse.add_argument('--use-gae', action='store_true', help='use-gae') 22 | parse.add_argument('--log-dir', type=str, default='logs', help='log dir') 23 | parse.add_argument('--env-type', type=str, default='atari', help='the type of the environment') 24 | 25 | args = parse.parse_args() 26 | 27 | return args 28 | -------------------------------------------------------------------------------- /rl_utils/running_filter/running_filter.py: 
-------------------------------------------------------------------------------- 1 | from collections import deque 2 | import numpy as np 3 | 4 | # this is from the https://github.com/ikostrikov/pytorch-trpo/blob/master/running_state.py 5 | 6 | # from https://github.com/joschu/modular_rl 7 | # http://www.johndcook.com/blog/standard_deviation/ 8 | class RunningStat(object): 9 | def __init__(self, shape): 10 | self._n = 0 11 | self._M = np.zeros(shape) 12 | self._S = np.zeros(shape) 13 | 14 | def push(self, x): 15 | x = np.asarray(x) 16 | assert x.shape == self._M.shape 17 | self._n += 1 18 | if self._n == 1: 19 | self._M[...] = x 20 | else: 21 | oldM = self._M.copy() 22 | self._M[...] = oldM + (x - oldM) / self._n 23 | self._S[...] = self._S + (x - oldM) * (x - self._M) 24 | 25 | @property 26 | def n(self): 27 | return self._n 28 | 29 | @property 30 | def mean(self): 31 | return self._M 32 | 33 | @property 34 | def var(self): 35 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M) 36 | 37 | @property 38 | def std(self): 39 | return np.sqrt(self.var) 40 | 41 | @property 42 | def shape(self): 43 | return self._M.shape 44 | 45 | 46 | class ZFilter: 47 | """ 48 | y = (x-mean)/std 49 | using running estimates of mean,std 50 | """ 51 | 52 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 53 | self.demean = demean 54 | self.destd = destd 55 | self.clip = clip 56 | 57 | self.rs = RunningStat(shape) 58 | 59 | def __call__(self, x, update=True): 60 | if update: self.rs.push(x) 61 | if self.demean: 62 | x = x - self.rs.mean 63 | if self.destd: 64 | x = x / (self.rs.std + 1e-8) 65 | if self.clip: 66 | x = np.clip(x, -self.clip, self.clip) 67 | return x 68 | 69 | def output_shape(self, input_space): 70 | return input_space.shape 71 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # the convolution layer of deepmind 6 | class deepmind(nn.Module): 7 | def __init__(self): 8 | super(deepmind, self).__init__() 9 | self.conv1 = nn.Conv2d(4, 32, 8, stride=4) 10 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 11 | self.conv3 = nn.Conv2d(64, 32, 3, stride=1) 12 | self.fc1 = nn.Linear(32 * 7 * 7, 512) 13 | # start to do the init... 14 | nn.init.orthogonal_(self.conv1.weight.data, gain=nn.init.calculate_gain('relu')) 15 | nn.init.orthogonal_(self.conv2.weight.data, gain=nn.init.calculate_gain('relu')) 16 | nn.init.orthogonal_(self.conv3.weight.data, gain=nn.init.calculate_gain('relu')) 17 | nn.init.orthogonal_(self.fc1.weight.data, gain=nn.init.calculate_gain('relu')) 18 | # init the bias... 19 | nn.init.constant_(self.conv1.bias.data, 0) 20 | nn.init.constant_(self.conv2.bias.data, 0) 21 | nn.init.constant_(self.conv3.bias.data, 0) 22 | nn.init.constant_(self.fc1.bias.data, 0) 23 | 24 | def forward(self, x): 25 | x = F.relu(self.conv1(x)) 26 | x = F.relu(self.conv2(x)) 27 | x = F.relu(self.conv3(x)) 28 | x = x.view(-1, 32 * 7 * 7) 29 | x = F.relu(self.fc1(x)) 30 | return x 31 | 32 | # in the initial, just the nature CNN 33 | class net(nn.Module): 34 | def __init__(self, num_actions): 35 | super(net, self).__init__() 36 | self.cnn_layer = deepmind() 37 | self.critic = nn.Linear(512, 1) 38 | self.actor = nn.Linear(512, num_actions) 39 | # init the linear layer.. 
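# the value head uses the default orthogonal gain; the policy head below uses a small gain (0.01) so the initial action distribution stays close to uniform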
40 | nn.init.orthogonal_(self.critic.weight.data) 41 | nn.init.constant_(self.critic.bias.data, 0) 42 | # init the policy layer... 43 | nn.init.orthogonal_(self.actor.weight.data, gain=0.01) 44 | nn.init.constant_(self.actor.bias.data, 0) 45 | 46 | def forward(self, inputs): 47 | x = self.cnn_layer(inputs / 255.0) 48 | value = self.critic(x) 49 | pi = F.softmax(self.actor(x), dim=1) 50 | return value, pi 51 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(): 4 | parse = argparse.ArgumentParser(description='ddpg') 5 | parse.add_argument('--env-name', type=str, default='Pendulum-v0', help='the training environment') 6 | parse.add_argument('--lr-actor', type=float, default=1e-4, help='the lr of the actor') 7 | parse.add_argument('--lr-critic', type=float, default=1e-3, help='the lr of the critic') 8 | parse.add_argument('--critic-l2-reg', type=float, default=1e-2, help='the critic reg') 9 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor') 10 | parse.add_argument('--nb-epochs', type=int, default=500, help='the epochs to train the network') 11 | parse.add_argument('--nb-cycles', type=int, default=20) 12 | parse.add_argument('--nb-train', type=int, default=50, help='number to train the agent') 13 | parse.add_argument('--nb-rollout-steps', type=int, default=100, help='steps to collect samples') 14 | parse.add_argument('--nb-test-rollouts', type=int, default=10, help='the number of test') 15 | parse.add_argument('--batch-size', type=int, default=128, help='the batch size to update network') 16 | parse.add_argument('--replay-size', type=int, default=int(1e6), help='the size of the replay buffer') 17 | parse.add_argument('--clip-range', type=float, default=5, help='clip range of the observation') 18 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the place save the models') 19 | parse.add_argument('--polyak', type=float, default=0.95, help='the expoential weighted coefficient.') 20 | parse.add_argument('--total-frames', type=int, default=int(1e6), help='total frames') 21 | parse.add_argument('--log-dir', type=str, default='logs', help='place to save log files') 22 | parse.add_argument('--env-type', type=str, default=None, help='environment type') 23 | parse.add_argument('--seed', type=int, default=123, help='random seed') 24 | parse.add_argument('--display-interval', type=int, default=10, help='interval to display') 25 | # ddpg not support gpu 26 | parse.add_argument('--cuda', action='store_true', help='if use GPU') 27 | 28 | args = parse.parse_args() 29 | return args 30 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.distributions.normal import Normal 4 | 5 | # select actions 6 | def select_actions(pi): 7 | mean, std = pi 8 | normal_dist = Normal(mean, std) 9 | return normal_dist.sample().detach().numpy().squeeze() 10 | 11 | # evaluate the actions 12 | def eval_actions(pi, actions): 13 | mean, std = pi 14 | normal_dist = Normal(mean, std) 15 | return normal_dist.log_prob(actions).sum(dim=1, keepdim=True) 16 | 17 | # conjugated gradient 18 | def conjugated_gradient(fvp, b, update_steps, obs, pi_old, residual_tol=1e-10): 19 | # the initial solution is 
zero 20 | x = torch.zeros(b.size(), dtype=torch.float32) 21 | r = b.clone() 22 | p = b.clone() 23 | rdotr = torch.dot(r, r) 24 | for i in range(update_steps): 25 | fv_product = fvp(p, obs, pi_old) 26 | alpha = rdotr / torch.dot(p, fv_product) 27 | x = x + alpha * p 28 | r = r - alpha * fv_product 29 | new_rdotr = torch.dot(r, r) 30 | beta = new_rdotr / rdotr 31 | p = r + beta * p 32 | rdotr = new_rdotr 33 | # if less than residual tot.. break 34 | if rdotr < residual_tol: 35 | break 36 | return x 37 | 38 | # line search 39 | def line_search(model, loss_fn, x, full_step, expected_rate, obs, adv, actions, pi_old, max_backtracks=10, accept_ratio=0.1): 40 | fval = loss_fn(obs, adv, actions, pi_old).data 41 | for (_n_backtracks, stepfrac) in enumerate(0.5**np.arange(max_backtracks)): 42 | xnew = x + stepfrac * full_step 43 | set_flat_params_to(model, xnew) 44 | new_fval = loss_fn(obs, adv, actions, pi_old).data 45 | actual_improve = fval - new_fval 46 | expected_improve = expected_rate * stepfrac 47 | ratio = actual_improve / expected_improve 48 | if ratio.item() > accept_ratio and actual_improve.item() > 0: 49 | return True, xnew 50 | return False, x 51 | 52 | 53 | def set_flat_params_to(model, flat_params): 54 | prev_indx = 0 55 | for param in model.parameters(): 56 | flat_size = int(np.prod(list(param.size()))) 57 | param.data.copy_(flat_params[prev_indx:prev_indx + flat_size].view(param.size())) 58 | prev_indx += flat_size 59 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(): 4 | parse = argparse.ArgumentParser() 5 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor of RL') 6 | parse.add_argument('--seed', type=int, default=123, help='the random seeds') 7 | parse.add_argument('--env-name', type=str, default='PongNoFrameskip-v4', help='the environment name') 8 | parse.add_argument('--batch-size', type=int, default=32, help='the batch size of updating') 9 | parse.add_argument('--lr', type=float, default=1e-4, help='learning rate of the algorithm') 10 | parse.add_argument('--buffer-size', type=int, default=10000, help='the size of the buffer') 11 | parse.add_argument('--cuda', action='store_true', help='if use the gpu') 12 | parse.add_argument('--init-ratio', type=float, default=1, help='the initial exploration ratio') 13 | parse.add_argument('--exploration_fraction', type=float, default=0.1, help='decide how many steps to do the exploration') 14 | parse.add_argument('--final-ratio', type=float, default=0.01, help='the final exploration ratio') 15 | parse.add_argument('--grad-norm-clipping', type=float, default=10, help='the gradient clipping') 16 | parse.add_argument('--total-timesteps', type=int, default=int(1e7), help='the total timesteps to train network') 17 | parse.add_argument('--learning-starts', type=int, default=10000, help='the frames start to learn') 18 | parse.add_argument('--train-freq', type=int, default=4, help='the frequency to update the network') 19 | parse.add_argument('--target-network-update-freq', type=int, default=1000, help='the frequency to update the target network') 20 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the folder to save models') 21 | parse.add_argument('--display-interval', type=int, default=10, help='the display interval') 22 | parse.add_argument('--env-type', type=str, default='atari', help='the 
environment type') 23 | parse.add_argument('--log-dir', type=str, default='logs', help='dir to save log information') 24 | parse.add_argument('--use-double-net', action='store_true', help='use double dqn to train the agent') 25 | parse.add_argument('--use-dueling', action='store_true', help='use dueling to train the agent') 26 | 27 | args = parse.parse_args() 28 | 29 | return args 30 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(): 4 | parse = argparse.ArgumentParser() 5 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor of RL') 6 | parse.add_argument('--seed', type=int, default=123, help='the random seeds') 7 | parse.add_argument('--num-workers', type=int, default=8, help='the number of workers to collect samples') 8 | parse.add_argument('--env-name', type=str, default='PongNoFrameskip-v4', help='the environment name') 9 | parse.add_argument('--batch-size', type=int, default=4, help='the batch size of updating') 10 | parse.add_argument('--lr', type=float, default=2.5e-4, help='learning rate of the algorithm') 11 | parse.add_argument('--epoch', type=int, default=4, help='the epoch during training') 12 | parse.add_argument('--nsteps', type=int, default=128, help='the steps to collect samples') 13 | parse.add_argument('--vloss-coef', type=float, default=0.5, help='the coefficient of value loss') 14 | parse.add_argument('--ent-coef', type=float, default=0.01, help='the entropy loss coefficient') 15 | parse.add_argument('--tau', type=float, default=0.95, help='gae coefficient') 16 | parse.add_argument('--cuda', action='store_true', help='use cuda do the training') 17 | parse.add_argument('--total-frames', type=int, default=20000000, help='the total frames for training') 18 | parse.add_argument('--dist', type=str, default='gauss', help='the distributions for sampling actions') 19 | parse.add_argument('--eps', type=float, default=1e-5, help='param for adam optimizer') 20 | parse.add_argument('--clip', type=float, default=0.1, help='the ratio clip param') 21 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the folder to save models') 22 | parse.add_argument('--lr-decay', action='store_true', help='if using the learning rate decay during decay') 23 | parse.add_argument('--max-grad-norm', type=float, default=0.5, help='grad norm') 24 | parse.add_argument('--display-interval', type=int, default=10, help='the interval that display log information') 25 | parse.add_argument('--env-type', type=str, default='atari', help='the type of the environment') 26 | parse.add_argument('--log-dir', type=str, default='logs', help='the folders to save the log files') 27 | 28 | args = parse.parse_args() 29 | 30 | return args 31 | -------------------------------------------------------------------------------- /rl_utils/env_wrapper/create_env.py: -------------------------------------------------------------------------------- 1 | from rl_utils.env_wrapper.atari_wrapper import make_atari, wrap_deepmind 2 | from rl_utils.env_wrapper.multi_envs_wrapper import SubprocVecEnv 3 | from rl_utils.env_wrapper.frame_stack import VecFrameStack 4 | from rl_utils.logger import logger, bench 5 | import os 6 | import gym 7 | 8 | """ 9 | this functions is to create the environments 10 | 11 | """ 12 | 13 | def create_single_env(args, rank=0): 14 | # setup the log files 15 | if rank == 0: 
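# only the first process (rank 0) creates the log directory and configures the logger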
16 | if not os.path.exists(args.log_dir): 17 | os.mkdir(args.log_dir) 18 | log_path = args.log_dir + '/{}/'.format(args.env_name) 19 | logger.configure(log_path) 20 | # start to create environment 21 | if args.env_type == 'atari': 22 | # create the environment 23 | env = make_atari(args.env_name) 24 | # the monitor 25 | env = bench.Monitor(env, logger.get_dir()) 26 | # use the deepmind environment wrapper 27 | env = wrap_deepmind(env, frame_stack=True) 28 | else: 29 | env = gym.make(args.env_name) 30 | # add log information 31 | env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True) 32 | # set seeds to the environment to make sure the reproducebility 33 | env.seed(args.seed + rank) 34 | return env 35 | 36 | # create multiple environments - for multiple 37 | def create_multiple_envs(args): 38 | # now only support the atari games 39 | if args.env_type == 'atari': 40 | def make_env(rank): 41 | def _thunk(): 42 | if not os.path.exists(args.log_dir): 43 | os.mkdir(args.log_dir) 44 | log_path = args.log_dir + '/{}/'.format(args.env_name) 45 | logger.configure(log_path) 46 | env = make_atari(args.env_name) 47 | # set the seed for the environment 48 | env.seed(args.seed + rank) 49 | # set loggler 50 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) 51 | # use the deepmind environment wrapper 52 | env = wrap_deepmind(env) 53 | return env 54 | return _thunk 55 | # put into sub processing 56 | envs = SubprocVecEnv([make_env(i) for i in range(args.num_workers)]) 57 | # then, frame stack 58 | envs = VecFrameStack(envs, 4) 59 | else: 60 | raise NotImplementedError 61 | return envs 62 | 63 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # the convolution layer of deepmind 6 | class deepmind(nn.Module): 7 | def __init__(self): 8 | super(deepmind, self).__init__() 9 | self.conv1 = nn.Conv2d(4, 32, 8, stride=4) 10 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 11 | self.conv3 = nn.Conv2d(64, 32, 3, stride=1) 12 | 13 | # start to do the init... 14 | nn.init.orthogonal_(self.conv1.weight.data, gain=nn.init.calculate_gain('relu')) 15 | nn.init.orthogonal_(self.conv2.weight.data, gain=nn.init.calculate_gain('relu')) 16 | nn.init.orthogonal_(self.conv3.weight.data, gain=nn.init.calculate_gain('relu')) 17 | # init the bias... 
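# the convolution biases are zero-initialized to go with the orthogonal weight initialization above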
18 | nn.init.constant_(self.conv1.bias.data, 0) 19 | nn.init.constant_(self.conv2.bias.data, 0) 20 | nn.init.constant_(self.conv3.bias.data, 0) 21 | 22 | def forward(self, x): 23 | x = F.relu(self.conv1(x)) 24 | x = F.relu(self.conv2(x)) 25 | x = F.relu(self.conv3(x)) 26 | x = x.view(-1, 32 * 7 * 7) 27 | 28 | return x 29 | 30 | # in the initial, just the nature CNN 31 | class net(nn.Module): 32 | def __init__(self, num_actions, use_dueling=False): 33 | super(net, self).__init__() 34 | # if use the dueling network 35 | self.use_dueling = use_dueling 36 | # define the network 37 | self.cnn_layer = deepmind() 38 | # if not use dueling 39 | if not self.use_dueling: 40 | self.fc1 = nn.Linear(32 * 7 * 7, 256) 41 | self.action_value = nn.Linear(256, num_actions) 42 | else: 43 | # the layer for dueling network architecture 44 | self.action_fc = nn.Linear(32 * 7 * 7, 256) 45 | self.state_value_fc = nn.Linear(32 * 7 * 7, 256) 46 | self.action_value = nn.Linear(256, num_actions) 47 | self.state_value = nn.Linear(256, 1) 48 | 49 | def forward(self, inputs): 50 | x = self.cnn_layer(inputs / 255.0) 51 | if not self.use_dueling: 52 | x = F.relu(self.fc1(x)) 53 | action_value_out = self.action_value(x) 54 | else: 55 | # get the action value 56 | action_fc = F.relu(self.action_fc(x)) 57 | action_value = self.action_value(action_fc) 58 | # get the state value 59 | state_value_fc = F.relu(self.state_value_fc(x)) 60 | state_value = self.state_value(state_value_fc) 61 | # action value mean 62 | action_value_mean = torch.mean(action_value, dim=1, keepdim=True) 63 | action_value_center = action_value - action_value_mean 64 | # Q = V + A 65 | action_value_out = state_value + action_value_center 66 | return action_value_out 67 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/demo.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from models import cnn_net, mlp_net 3 | import torch 4 | import cv2 5 | import numpy as np 6 | import gym 7 | from rl_utils.env_wrapper.frame_stack import VecFrameStack 8 | from rl_utils.env_wrapper.atari_wrapper import make_atari, wrap_deepmind 9 | 10 | # denormalize 11 | def normalize(x, mean, std, clip=10): 12 | x -= mean 13 | x /= (std + 1e-8) 14 | return np.clip(x, -clip, clip) 15 | 16 | # get tensors for the agent 17 | def get_tensors(obs, env_type, filters=None): 18 | if env_type == 'atari': 19 | tensor = torch.tensor(np.transpose(obs, (2, 0, 1)), dtype=torch.float32).unsqueeze(0) 20 | elif env_type == 'mujoco': 21 | tensor = torch.tensor(normalize(obs, filters.rs.mean, filters.rs.std), dtype=torch.float32).unsqueeze(0) 22 | return tensor 23 | 24 | if __name__ == '__main__': 25 | # get the arguments 26 | args = get_args() 27 | # create the environment 28 | if args.env_type == 'atari': 29 | env = make_atari(args.env_name) 30 | env = wrap_deepmind(env, frame_stack=True) 31 | elif args.env_type == 'mujoco': 32 | env = gym.make(args.env_name) 33 | # get the model path 34 | model_path = args.save_dir + args.env_name + '/model.pt' 35 | # create the network 36 | if args.env_type == 'atari': 37 | network = cnn_net(env.action_space.n) 38 | network.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage)) 39 | filters = None 40 | elif args.env_type == 'mujoco': 41 | network = mlp_net(env.observation_space.shape[0], env.action_space.shape[0], args.dist) 42 | net_models, filters = torch.load(model_path, map_location=lambda storage, loc: storage) 
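# the two branches assume different checkpoint layouts: for atari, model.pt
# holds a bare state_dict, while for mujoco it holds a
# (state_dict, running_filter) tuple saved at training time, so the
# observation filter can be restored and reused in get_tensors()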
43 | # load models 44 | network.load_state_dict(net_models) 45 | # start to play the demo 46 | obs = env.reset() 47 | reward_total = 0 48 | # just one episode 49 | while True: 50 | env.render() 51 | with torch.no_grad(): 52 | obs_tensor = get_tensors(obs, args.env_type, filters) 53 | _, pi = network(obs_tensor) 54 | # get actions 55 | if args.env_type == 'atari': 56 | actions = torch.argmax(pi, dim=1).item() 57 | elif args.env_type == 'mujoco': 58 | if args.dist == 'gauss': 59 | mean, _ = pi 60 | actions = mean.numpy().squeeze() 61 | elif args.dist == 'beta': 62 | alpha, beta = pi 63 | actions = (alpha - 1) / (alpha + beta - 2) 64 | actions = actions.numpy().squeeze() 65 | actions = -1 + 2 * actions 66 | obs_, reward, done, _ = env.step(actions) 67 | reward_total += reward 68 | if done: 69 | break 70 | obs = obs_ 71 | print('the rewrads is: {}'.format(reward_total)) 72 | -------------------------------------------------------------------------------- /rl_utils/mpi_utils/normalizer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import numpy as np 3 | from mpi4py import MPI 4 | 5 | class normalizer: 6 | def __init__(self, size, eps=1e-2, default_clip_range=np.inf): 7 | self.size = size 8 | self.eps = eps 9 | self.default_clip_range = default_clip_range 10 | # some local information 11 | self.local_sum = np.zeros(self.size, np.float32) 12 | self.local_sumsq = np.zeros(self.size, np.float32) 13 | self.local_count = np.zeros(1, np.float32) 14 | # get the total sum sumsq and sum count 15 | self.total_sum = np.zeros(self.size, np.float32) 16 | self.total_sumsq = np.zeros(self.size, np.float32) 17 | self.total_count = np.ones(1, np.float32) 18 | # get the mean and std 19 | self.mean = np.zeros(self.size, np.float32) 20 | self.std = np.ones(self.size, np.float32) 21 | # thread locker 22 | self.lock = threading.Lock() 23 | 24 | # update the parameters of the normalizer 25 | def update(self, v): 26 | v = v.reshape(-1, self.size) 27 | # do the computing 28 | with self.lock: 29 | self.local_sum += v.sum(axis=0) 30 | self.local_sumsq += (np.square(v)).sum(axis=0) 31 | self.local_count[0] += v.shape[0] 32 | 33 | # sync the parameters across the cpus 34 | def sync(self, local_sum, local_sumsq, local_count): 35 | local_sum[...] = self._mpi_average(local_sum) 36 | local_sumsq[...] = self._mpi_average(local_sumsq) 37 | local_count[...] = self._mpi_average(local_count) 38 | return local_sum, local_sumsq, local_count 39 | 40 | def recompute_stats(self): 41 | with self.lock: 42 | local_count = self.local_count.copy() 43 | local_sum = self.local_sum.copy() 44 | local_sumsq = self.local_sumsq.copy() 45 | # reset 46 | self.local_count[...] = 0 47 | self.local_sum[...] = 0 48 | self.local_sumsq[...] 
= 0 49 | # synrc the stats 50 | sync_sum, sync_sumsq, sync_count = self.sync(local_sum, local_sumsq, local_count) 51 | # update the total stuff 52 | self.total_sum += sync_sum 53 | self.total_sumsq += sync_sumsq 54 | self.total_count += sync_count 55 | # calculate the new mean and std 56 | self.mean = self.total_sum / self.total_count 57 | self.std = np.sqrt(np.maximum(np.square(self.eps), (self.total_sumsq / self.total_count) - np.square(self.total_sum / self.total_count))) 58 | 59 | # average across the cpu's data 60 | def _mpi_average(self, x): 61 | buf = np.zeros_like(x) 62 | MPI.COMM_WORLD.Allreduce(x, buf, op=MPI.SUM) 63 | buf /= MPI.COMM_WORLD.Get_size() 64 | return buf 65 | 66 | # normalize the observation 67 | def normalize(self, v, clip_range=None): 68 | if clip_range is None: 69 | clip_range = self.default_clip_range 70 | return np.clip((v - self.mean) / (self.std), -clip_range, clip_range) 71 | -------------------------------------------------------------------------------- /rl_algorithms/sac/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.distributions.normal import Normal 4 | from torch.distributions import Distribution 5 | 6 | """ 7 | the tanhnormal distributions from rlkit may not stable 8 | 9 | """ 10 | class tanh_normal(Distribution): 11 | def __init__(self, normal_mean, normal_std, epsilon=1e-6, cuda=False): 12 | self.normal_mean = normal_mean 13 | self.normal_std = normal_std 14 | self.cuda = cuda 15 | self.normal = Normal(normal_mean, normal_std) 16 | self.epsilon = epsilon 17 | 18 | def sample_n(self, n, return_pre_tanh_value=False): 19 | z = self.normal.sample_n(n) 20 | if return_pre_tanh_value: 21 | return torch.tanh(z), z 22 | else: 23 | return torch.tanh(z) 24 | 25 | def log_prob(self, value, pre_tanh_value=None): 26 | """ 27 | :param value: some value, x 28 | :param pre_tanh_value: arctanh(x) 29 | :return: 30 | """ 31 | if pre_tanh_value is None: 32 | pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2 33 | return self.normal.log_prob(pre_tanh_value) - torch.log(1 - value * value + self.epsilon) 34 | 35 | def sample(self, return_pretanh_value=False): 36 | """ 37 | Gradients will and should *not* pass through this operation. 38 | 39 | See https://github.com/pytorch/pytorch/issues/4620 for discussion. 40 | """ 41 | z = self.normal.sample().detach() 42 | if return_pretanh_value: 43 | return torch.tanh(z), z 44 | else: 45 | return torch.tanh(z) 46 | 47 | def rsample(self, return_pretanh_value=False): 48 | """ 49 | Sampling in the reparameterization case. 
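        Concretely (as the code below does): draw eps ~ N(0, 1) and return
        tanh(mean + std * eps), so gradients can flow back into mean and std
        via the reparameterization trick, unlike sample() above which detaches.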
50 | """ 51 | sample_mean = torch.zeros(self.normal_mean.size(), dtype=torch.float32, device='cuda' if self.cuda else 'cpu') 52 | sample_std = torch.ones(self.normal_std.size(), dtype=torch.float32, device='cuda' if self.cuda else 'cpu') 53 | z = (self.normal_mean + self.normal_std * Normal(sample_mean, sample_std).sample()) 54 | z.requires_grad_() 55 | if return_pretanh_value: 56 | return torch.tanh(z), z 57 | else: 58 | return torch.tanh(z) 59 | 60 | # get action_infos 61 | class get_action_info: 62 | def __init__(self, pis, cuda=False): 63 | self.mean, self.std = pis 64 | self.dist = tanh_normal(normal_mean=self.mean, normal_std=self.std, cuda=cuda) 65 | 66 | # select actions 67 | def select_actions(self, exploration=True, reparameterize=True): 68 | if exploration: 69 | if reparameterize: 70 | actions, pretanh = self.dist.rsample(return_pretanh_value=True) 71 | return actions, pretanh 72 | else: 73 | actions = self.dist.sample() 74 | else: 75 | actions = torch.tanh(self.mean) 76 | return actions 77 | 78 | def get_log_prob(self, actions, pre_tanh_value): 79 | log_prob = self.dist.log_prob(actions, pre_tanh_value=pre_tanh_value) 80 | return log_prob.sum(dim=1, keepdim=True) 81 | -------------------------------------------------------------------------------- /rl_algorithms/sac/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # define the arguments that will be used in the SAC 4 | def get_args(): 5 | parse = argparse.ArgumentParser() 6 | parse.add_argument('--env-name', type=str, default='HalfCheetah-v2', help='the environment name') 7 | parse.add_argument('--cuda', action='store_true', help='use GPU do the training') 8 | parse.add_argument('--seed', type=int, default=123, help='the random seed to reproduce results') 9 | parse.add_argument('--hidden-size', type=int, default=256, help='the size of the hidden layer') 10 | parse.add_argument('--train-loop-per-epoch', type=int, default=1, help='the training loop per epoch') 11 | parse.add_argument('--q-lr', type=float, default=3e-4, help='the learning rate of the critic') 12 | parse.add_argument('--p-lr', type=float, default=3e-4, help='the learning rate of the actor') 13 | parse.add_argument('--n-epochs', type=int, default=int(3e3), help='the number of total epochs') 14 | parse.add_argument('--epoch-length', type=int, default=int(1e3), help='the lenght of each epoch') 15 | parse.add_argument('--n-updates', type=int, default=int(1e3), help='the number of training updates execute') 16 | parse.add_argument('--init-exploration-steps', type=int, default=int(1e3), help='the steps of the initial exploration') 17 | parse.add_argument('--init-exploration-policy', type=str, default='gaussian', help='the inital exploration policy') 18 | parse.add_argument('--buffer-size', type=int, default=int(1e6), help='the size of the replay buffer') 19 | parse.add_argument('--batch-size', type=int, default=256, help='the batch size of samples for training') 20 | parse.add_argument('--reward-scale', type=float, default=1, help='the reward scale') 21 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor') 22 | parse.add_argument('--log-std-max', type=float, default=2, help='the maximum log std value') 23 | parse.add_argument('--log-std-min', type=float, default=-20, help='the minimum log std value') 24 | parse.add_argument('--entropy-weights', type=float, default=0.2, help='the entropy weights') 25 | parse.add_argument('--tau', type=float, default=5e-3, help='the 
soft update coefficient') 26 | parse.add_argument('--target-update-interval', type=int, default=1, help='the interval to update target network') 27 | parse.add_argument('--update-cycles', type=int, default=int(1e3), help='how many updates apply in the update') 28 | parse.add_argument('--eval-episodes', type=int, default=10, help='the episodes that used for evaluation') 29 | parse.add_argument('--display-interval', type=int, default=1, help='the display interval') 30 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the place to save models') 31 | parse.add_argument('--reg', type=float, default=1e-3, help='the reg term') 32 | parse.add_argument('--auto-ent-tuning', action='store_true', help='tune the entorpy automatically') 33 | parse.add_argument('--log-dir', type=str, default='logs', help='dir to save log information') 34 | parse.add_argument('--env-type', type=str, default=None, help='environment type') 35 | 36 | return parse.parse_args() 37 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | """ 6 | this network also include gaussian distribution and beta distribution 7 | 8 | """ 9 | 10 | class mlp_net(nn.Module): 11 | def __init__(self, state_size, num_actions, dist_type): 12 | super(mlp_net, self).__init__() 13 | self.dist_type = dist_type 14 | self.fc1_v = nn.Linear(state_size, 64) 15 | self.fc2_v = nn.Linear(64, 64) 16 | self.fc1_a = nn.Linear(state_size, 64) 17 | self.fc2_a = nn.Linear(64, 64) 18 | # check the type of distribution 19 | if self.dist_type == 'gauss': 20 | self.sigma_log = nn.Parameter(torch.zeros(1, num_actions)) 21 | self.action_mean = nn.Linear(64, num_actions) 22 | self.action_mean.weight.data.mul_(0.1) 23 | self.action_mean.bias.data.zero_() 24 | elif self.dist_type == 'beta': 25 | self.action_alpha = nn.Linear(64, num_actions) 26 | self.action_beta = nn.Linear(64, num_actions) 27 | # init.. 28 | self.action_alpha.weight.data.mul_(0.1) 29 | self.action_alpha.bias.data.zero_() 30 | self.action_beta.weight.data.mul_(0.1) 31 | self.action_beta.bias.data.zero_() 32 | 33 | # define layers to output state value 34 | self.value = nn.Linear(64, 1) 35 | self.value.weight.data.mul_(0.1) 36 | self.value.bias.data.zero_() 37 | 38 | def forward(self, x): 39 | x_v = torch.tanh(self.fc1_v(x)) 40 | x_v = torch.tanh(self.fc2_v(x_v)) 41 | state_value = self.value(x_v) 42 | # output the policy... 43 | x_a = torch.tanh(self.fc1_a(x)) 44 | x_a = torch.tanh(self.fc2_a(x_a)) 45 | if self.dist_type == 'gauss': 46 | mean = self.action_mean(x_a) 47 | sigma_log = self.sigma_log.expand_as(mean) 48 | sigma = torch.exp(sigma_log) 49 | pi = (mean, sigma) 50 | elif self.dist_type == 'beta': 51 | alpha = F.softplus(self.action_alpha(x_a)) + 1 52 | beta = F.softplus(self.action_beta(x_a)) + 1 53 | pi = (alpha, beta) 54 | 55 | return state_value, pi 56 | 57 | # the convolution layer of deepmind 58 | class deepmind(nn.Module): 59 | def __init__(self): 60 | super(deepmind, self).__init__() 61 | self.conv1 = nn.Conv2d(4, 32, 8, stride=4) 62 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 63 | self.conv3 = nn.Conv2d(64, 32, 3, stride=1) 64 | self.fc1 = nn.Linear(32 * 7 * 7, 512) 65 | # start to do the init... 
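# the initialization below follows the common A2C/PPO recipe: orthogonal
# weights with the ReLU gain (sqrt(2)) and zero biases for the convolutional
# and hidden layers, while the policy head further down uses a much smaller
# gain (0.01) so the initial action distribution is close to uniform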
66 | nn.init.orthogonal_(self.conv1.weight.data, gain=nn.init.calculate_gain('relu')) 67 | nn.init.orthogonal_(self.conv2.weight.data, gain=nn.init.calculate_gain('relu')) 68 | nn.init.orthogonal_(self.conv3.weight.data, gain=nn.init.calculate_gain('relu')) 69 | nn.init.orthogonal_(self.fc1.weight.data, gain=nn.init.calculate_gain('relu')) 70 | # init the bias... 71 | nn.init.constant_(self.conv1.bias.data, 0) 72 | nn.init.constant_(self.conv2.bias.data, 0) 73 | nn.init.constant_(self.conv3.bias.data, 0) 74 | nn.init.constant_(self.fc1.bias.data, 0) 75 | 76 | def forward(self, x): 77 | x = F.relu(self.conv1(x)) 78 | x = F.relu(self.conv2(x)) 79 | x = F.relu(self.conv3(x)) 80 | x = x.view(-1, 32 * 7 * 7) 81 | x = F.relu(self.fc1(x)) 82 | return x 83 | 84 | # in the initial, just the nature CNN 85 | class cnn_net(nn.Module): 86 | def __init__(self, num_actions): 87 | super(cnn_net, self).__init__() 88 | self.cnn_layer = deepmind() 89 | self.critic = nn.Linear(512, 1) 90 | self.actor = nn.Linear(512, num_actions) 91 | # init the linear layer.. 92 | nn.init.orthogonal_(self.critic.weight.data) 93 | nn.init.constant_(self.critic.bias.data, 0) 94 | # init the policy layer... 95 | nn.init.orthogonal_(self.actor.weight.data, gain=0.01) 96 | nn.init.constant_(self.actor.bias.data, 0) 97 | 98 | def forward(self, inputs): 99 | x = self.cnn_layer(inputs / 255.0) 100 | value = self.critic(x) 101 | pi = F.softmax(self.actor(x), dim=1) 102 | return value, pi 103 | -------------------------------------------------------------------------------- /rl_utils/logger/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | import seaborn as sns 4 | from rl_utils.bench import load_results 5 | 6 | sns.set(style="dark") 7 | sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 2}) 8 | sns.set(rc={"figure.figsize": (15, 8)}) 9 | colors = sns.color_palette(palette='muted') 10 | 11 | 12 | X_TIMESTEPS = 'timesteps' 13 | X_EPISODES = 'episodes' 14 | X_WALLTIME = 'walltime_hrs' 15 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 16 | EPISODES_WINDOW = 150 17 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 18 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 19 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 20 | 21 | def rolling_window(a, window): 22 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 23 | strides = a.strides + (a.strides[-1],) 24 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 25 | 26 | def window_func(x, y, window, func): 27 | yw = rolling_window(y, window) 28 | yw_func = func(yw, axis=-1) 29 | return x[window-1:], yw_func 30 | 31 | def ts2xy(ts, xaxis): 32 | if xaxis == X_TIMESTEPS: 33 | x = np.cumsum(ts.l.values) 34 | y = ts.r.values 35 | elif xaxis == X_EPISODES: 36 | x = np.arange(len(ts)) 37 | y = ts.r.values 38 | elif xaxis == X_WALLTIME: 39 | x = ts.t.values / 3600. 
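# ts is the dataframe produced by load_results(): the monitor files store one
# row per episode with columns r (episode reward), l (episode length in
# steps) and t (wall-clock seconds since start), hence the division by 3600
# to plot walltime in hours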
40 | y = ts.r.values 41 | else: 42 | raise NotImplementedError 43 | return x, y 44 | 45 | def plot_curves(xy_list, xaxis, title, plt_order, beta=False): 46 | maxx = max(xy[0][-1] for xy in xy_list) 47 | minx = 0 48 | if beta == 'dqn': 49 | label = ['DQN'] 50 | elif beta == 'ddqn': 51 | label = ['Double-DQN'] 52 | elif beta == 'dueling': 53 | label = ['Dueling-DQN'] 54 | psub = plt.subplot(plt_order) 55 | plt.tight_layout() 56 | plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) 57 | for (i, (x, y)) in enumerate(xy_list): 58 | #plt.scatter(x, y, s=2) 59 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes 60 | psub.plot(x, y_mean, label=label[i]) 61 | psub.set_xlim([minx, maxx]) 62 | psub.set_title(title) 63 | psub.legend(loc='best') 64 | psub.set_xlabel(xaxis) 65 | psub.set_ylabel("rewards") 66 | 67 | def plot_results(dirs, num_timesteps, xaxis, task_name, plt_order, beta=False): 68 | tslist = [] 69 | for dir in dirs: 70 | ts = load_results(dir) 71 | ts = ts[ts.l.cumsum() <= num_timesteps] 72 | tslist.append(ts) 73 | xy_list = [ts2xy(ts, xaxis) for ts in tslist] 74 | plot_curves(xy_list, xaxis, task_name, plt_order, beta) 75 | 76 | def main(): 77 | import argparse 78 | import os 79 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 80 | parser.add_argument('--dirs', help='List of log directories', nargs = '*', default='logs_dqn/') 81 | parser.add_argument('--num_timesteps', type=int, default=int(2e7)) 82 | parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) 83 | parser.add_argument('--task_name', help = 'Title of plot', default = 'BreakoutNoFrameskip-v4') 84 | args = parser.parse_args() 85 | env_name = ['BankHeistNoFrameskip-v4', 'BreakoutNoFrameskip-v4', 'KangarooNoFrameskip-v4', \ 86 | 'PongNoFrameskip-v4', 'SeaquestNoFrameskip-v4', 'SpaceInvadersNoFrameskip-v4'] 87 | dirs = [os.path.abspath(args.dirs + name) for name in env_name] 88 | for idx in range(len(dirs)): 89 | plot_results([dirs[idx]], args.num_timesteps, args.xaxis, env_name[idx], 231+idx, beta='dqn') 90 | double_dirs = [os.path.abspath('logs_ddqn/' + name) for name in env_name] 91 | for idx in range(len(dirs)): 92 | plot_results([double_dirs[idx]], args.num_timesteps, args.xaxis, env_name[idx], 231+idx, beta='ddqn') 93 | dueling_dirs = [os.path.abspath('logs/' + name) for name in env_name] 94 | for idx in range(len(dirs)): 95 | plot_results([dueling_dirs[idx]], args.num_timesteps, args.xaxis, env_name[idx], 231+idx, beta='dueling') 96 | plt.savefig("dueling.png") 97 | 98 | if __name__ == '__main__': 99 | main() 100 | 101 | -------------------------------------------------------------------------------- /rl_utils/env_wrapper/multi_envs_wrapper.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import numpy as np 3 | from rl_utils.env_wrapper import VecEnv, CloudpickleWrapper, clear_mpi_env_vars 4 | 5 | def worker(remote, parent_remote, env_fn_wrapper): 6 | parent_remote.close() 7 | env = env_fn_wrapper.x() 8 | try: 9 | while True: 10 | cmd, data = remote.recv() 11 | if cmd == 'step': 12 | ob, reward, done, info = env.step(data) 13 | if done: 14 | ob = env.reset() 15 | remote.send((ob, reward, done, info)) 16 | elif cmd == 'reset': 17 | ob = env.reset() 18 | remote.send(ob) 19 | elif cmd == 'render': 20 | remote.send(env.render(mode='rgb_array')) 21 | elif cmd == 'close': 22 | remote.close() 23 | break 24 | elif cmd == 
'get_spaces_spec': 25 | remote.send((env.observation_space, env.action_space, env.spec)) 26 | else: 27 | raise NotImplementedError 28 | except KeyboardInterrupt: 29 | print('SubprocVecEnv worker: got KeyboardInterrupt') 30 | finally: 31 | env.close() 32 | 33 | 34 | class SubprocVecEnv(VecEnv): 35 | """ 36 | VecEnv that runs multiple environments in parallel in subproceses and communicates with them via pipes. 37 | Recommended to use when num_envs > 1 and step() can be a bottleneck. 38 | """ 39 | def __init__(self, env_fns, spaces=None, context='spawn'): 40 | """ 41 | Arguments: 42 | 43 | env_fns: iterable of callables - functions that create environments to run in subprocesses. Need to be cloud-pickleable 44 | """ 45 | self.waiting = False 46 | self.closed = False 47 | nenvs = len(env_fns) 48 | ctx = mp.get_context(context) 49 | self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(nenvs)]) 50 | self.ps = [ctx.Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 51 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 52 | for p in self.ps: 53 | p.daemon = True # if the main process crashes, we should not cause things to hang 54 | with clear_mpi_env_vars(): 55 | p.start() 56 | for remote in self.work_remotes: 57 | remote.close() 58 | 59 | self.remotes[0].send(('get_spaces_spec', None)) 60 | observation_space, action_space, self.spec = self.remotes[0].recv() 61 | self.viewer = None 62 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 63 | 64 | def step_async(self, actions): 65 | self._assert_not_closed() 66 | for remote, action in zip(self.remotes, actions): 67 | remote.send(('step', action)) 68 | self.waiting = True 69 | 70 | def step_wait(self): 71 | self._assert_not_closed() 72 | results = [remote.recv() for remote in self.remotes] 73 | self.waiting = False 74 | obs, rews, dones, infos = zip(*results) 75 | return _flatten_obs(obs), np.stack(rews), np.stack(dones), infos 76 | 77 | def reset(self): 78 | self._assert_not_closed() 79 | for remote in self.remotes: 80 | remote.send(('reset', None)) 81 | return _flatten_obs([remote.recv() for remote in self.remotes]) 82 | 83 | def close_extras(self): 84 | self.closed = True 85 | if self.waiting: 86 | for remote in self.remotes: 87 | remote.recv() 88 | for remote in self.remotes: 89 | remote.send(('close', None)) 90 | for p in self.ps: 91 | p.join() 92 | 93 | def get_images(self): 94 | self._assert_not_closed() 95 | for pipe in self.remotes: 96 | pipe.send(('render', None)) 97 | imgs = [pipe.recv() for pipe in self.remotes] 98 | return imgs 99 | 100 | def _assert_not_closed(self): 101 | assert not self.closed, "Trying to operate on a SubprocVecEnv after calling close()" 102 | 103 | def __del__(self): 104 | if not self.closed: 105 | self.close() 106 | 107 | def _flatten_obs(obs): 108 | assert isinstance(obs, (list, tuple)) 109 | assert len(obs) > 0 110 | 111 | if isinstance(obs[0], dict): 112 | keys = obs[0].keys() 113 | return {k: np.stack([o[k] for o in obs]) for k in keys} 114 | else: 115 | return np.stack(obs) 116 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from models import net 4 | from utils import linear_schedule, select_actions, reward_recorder 5 | from rl_utils.experience_replay.experience_replay import replay_buffer 6 | import torch 7 | from 
datetime import datetime 8 | import os 9 | import copy 10 | 11 | # define the dqn agent 12 | class dqn_agent: 13 | def __init__(self, env, args): 14 | # define some important 15 | self.env = env 16 | self.args = args 17 | # define the network 18 | self.net = net(self.env.action_space.n, self.args.use_dueling) 19 | # copy the self.net as the 20 | self.target_net = copy.deepcopy(self.net) 21 | # make sure the target net has the same weights as the network 22 | self.target_net.load_state_dict(self.net.state_dict()) 23 | if self.args.cuda: 24 | self.net.cuda() 25 | self.target_net.cuda() 26 | # define the optimizer 27 | self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.args.lr) 28 | # define the replay memory 29 | self.buffer = replay_buffer(self.args.buffer_size) 30 | # define the linear schedule of the exploration 31 | self.exploration_schedule = linear_schedule(int(self.args.total_timesteps * self.args.exploration_fraction), \ 32 | self.args.final_ratio, self.args.init_ratio) 33 | # create the folder to save the models 34 | if not os.path.exists(self.args.save_dir): 35 | os.mkdir(self.args.save_dir) 36 | # set the environment folder 37 | self.model_path = os.path.join(self.args.save_dir, self.args.env_name) 38 | if not os.path.exists(self.model_path): 39 | os.mkdir(self.model_path) 40 | 41 | # start to do the training 42 | def learn(self): 43 | # the episode reward 44 | episode_reward = reward_recorder() 45 | obs = np.array(self.env.reset()) 46 | td_loss = 0 47 | for timestep in range(self.args.total_timesteps): 48 | explore_eps = self.exploration_schedule.get_value(timestep) 49 | with torch.no_grad(): 50 | obs_tensor = self._get_tensors(obs) 51 | action_value = self.net(obs_tensor) 52 | # select actions 53 | action = select_actions(action_value, explore_eps) 54 | # excute actions 55 | obs_, reward, done, _ = self.env.step(action) 56 | obs_ = np.array(obs_) 57 | # tryint to append the samples 58 | self.buffer.add(obs, action, reward, obs_, float(done)) 59 | obs = obs_ 60 | # add the rewards 61 | episode_reward.add_rewards(reward) 62 | if done: 63 | obs = np.array(self.env.reset()) 64 | # start new episode to store rewards 65 | episode_reward.start_new_episode() 66 | if timestep > self.args.learning_starts and timestep % self.args.train_freq == 0: 67 | # start to sample the samples from the replay buffer 68 | batch_samples = self.buffer.sample(self.args.batch_size) 69 | td_loss = self._update_network(batch_samples) 70 | if timestep > self.args.learning_starts and timestep % self.args.target_network_update_freq == 0: 71 | # update the target network 72 | self.target_net.load_state_dict(self.net.state_dict()) 73 | if done and episode_reward.num_episodes % self.args.display_interval == 0: 74 | print('[{}] Frames: {}, Episode: {}, Mean: {:.3f}, Loss: {:.3f}'.format(datetime.now(), timestep, episode_reward.num_episodes, \ 75 | episode_reward.mean, td_loss)) 76 | torch.save(self.net.state_dict(), self.model_path + '/model.pt') 77 | 78 | # update the network 79 | def _update_network(self, samples): 80 | obses, actions, rewards, obses_next, dones = samples 81 | # convert the data to tensor 82 | obses = self._get_tensors(obses) 83 | actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(-1) 84 | rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(-1) 85 | obses_next = self._get_tensors(obses_next) 86 | dones = torch.tensor(1 - dones, dtype=torch.float32).unsqueeze(-1) 87 | # convert into gpu 88 | if self.args.cuda: 89 | actions = actions.cuda() 90 | rewards = 
rewards.cuda() 91 | dones = dones.cuda() 92 | # calculate the target value 93 | with torch.no_grad(): 94 | # if use the double network architecture 95 | if self.args.use_double_net: 96 | q_value_ = self.net(obses_next) 97 | action_max_idx = torch.argmax(q_value_, dim=1, keepdim=True) 98 | target_action_value = self.target_net(obses_next) 99 | target_action_max_value = target_action_value.gather(1, action_max_idx) 100 | else: 101 | target_action_value = self.target_net(obses_next) 102 | target_action_max_value, _ = torch.max(target_action_value, dim=1, keepdim=True) 103 | # target 104 | expected_value = rewards + self.args.gamma * target_action_max_value * dones 105 | # get the real q value 106 | action_value = self.net(obses) 107 | real_value = action_value.gather(1, actions) 108 | loss = (expected_value - real_value).pow(2).mean() 109 | # start to update 110 | self.optimizer.zero_grad() 111 | loss.backward() 112 | self.optimizer.step() 113 | return loss.item() 114 | 115 | # get tensors 116 | def _get_tensors(self, obs): 117 | if obs.ndim == 3: 118 | obs = np.transpose(obs, (2, 0, 1)) 119 | obs = np.expand_dims(obs, 0) 120 | elif obs.ndim == 4: 121 | obs = np.transpose(obs, (0, 3, 1, 2)) 122 | obs = torch.tensor(obs, dtype=torch.float32) 123 | if self.args.cuda: 124 | obs = obs.cuda() 125 | return obs 126 | -------------------------------------------------------------------------------- /rl_utils/logger/bench.py: -------------------------------------------------------------------------------- 1 | __all__ = ['Monitor', 'get_monitor_files', 'load_results'] 2 | 3 | from gym.core import Wrapper 4 | import time 5 | from glob import glob 6 | import csv 7 | import os.path as osp 8 | import json 9 | 10 | class Monitor(Wrapper): 11 | EXT = "monitor.csv" 12 | f = None 13 | 14 | def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()): 15 | Wrapper.__init__(self, env=env) 16 | self.tstart = time.time() 17 | if filename: 18 | self.results_writer = ResultsWriter(filename, 19 | header={"t_start": time.time(), 'env_id' : env.spec and env.spec.id}, 20 | extra_keys=reset_keywords + info_keywords 21 | ) 22 | else: 23 | self.results_writer = None 24 | self.reset_keywords = reset_keywords 25 | self.info_keywords = info_keywords 26 | self.allow_early_resets = allow_early_resets 27 | self.rewards = None 28 | self.needs_reset = True 29 | self.episode_rewards = [] 30 | self.episode_lengths = [] 31 | self.episode_times = [] 32 | self.total_steps = 0 33 | self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() 34 | 35 | def reset(self, **kwargs): 36 | self.reset_state() 37 | for k in self.reset_keywords: 38 | v = kwargs.get(k) 39 | if v is None: 40 | raise ValueError('Expected you to pass kwarg %s into reset'%k) 41 | self.current_reset_info[k] = v 42 | return self.env.reset(**kwargs) 43 | 44 | def reset_state(self): 45 | if not self.allow_early_resets and not self.needs_reset: 46 | raise RuntimeError("Tried to reset an environment before done. 
If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)") 47 | self.rewards = [] 48 | self.needs_reset = False 49 | 50 | 51 | def step(self, action): 52 | if self.needs_reset: 53 | raise RuntimeError("Tried to step environment that needs reset") 54 | ob, rew, done, info = self.env.step(action) 55 | self.update(ob, rew, done, info) 56 | return (ob, rew, done, info) 57 | 58 | def update(self, ob, rew, done, info): 59 | self.rewards.append(rew) 60 | if done: 61 | self.needs_reset = True 62 | eprew = sum(self.rewards) 63 | eplen = len(self.rewards) 64 | epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)} 65 | for k in self.info_keywords: 66 | epinfo[k] = info[k] 67 | self.episode_rewards.append(eprew) 68 | self.episode_lengths.append(eplen) 69 | self.episode_times.append(time.time() - self.tstart) 70 | epinfo.update(self.current_reset_info) 71 | if self.results_writer: 72 | self.results_writer.write_row(epinfo) 73 | assert isinstance(info, dict) 74 | if isinstance(info, dict): 75 | info['episode'] = epinfo 76 | 77 | self.total_steps += 1 78 | 79 | def close(self): 80 | if self.f is not None: 81 | self.f.close() 82 | 83 | def get_total_steps(self): 84 | return self.total_steps 85 | 86 | def get_episode_rewards(self): 87 | return self.episode_rewards 88 | 89 | def get_episode_lengths(self): 90 | return self.episode_lengths 91 | 92 | def get_episode_times(self): 93 | return self.episode_times 94 | 95 | class LoadMonitorResultsError(Exception): 96 | pass 97 | 98 | 99 | class ResultsWriter(object): 100 | def __init__(self, filename, header='', extra_keys=()): 101 | self.extra_keys = extra_keys 102 | assert filename is not None 103 | if not filename.endswith(Monitor.EXT): 104 | if osp.isdir(filename): 105 | filename = osp.join(filename, Monitor.EXT) 106 | else: 107 | filename = filename + "." 
+ Monitor.EXT 108 | self.f = open(filename, "wt") 109 | if isinstance(header, dict): 110 | header = '# {} \n'.format(json.dumps(header)) 111 | self.f.write(header) 112 | self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+tuple(extra_keys)) 113 | self.logger.writeheader() 114 | self.f.flush() 115 | 116 | def write_row(self, epinfo): 117 | if self.logger: 118 | self.logger.writerow(epinfo) 119 | self.f.flush() 120 | 121 | 122 | def get_monitor_files(dir): 123 | return glob(osp.join(dir, "*" + Monitor.EXT)) 124 | 125 | def load_results(dir): 126 | import pandas 127 | monitor_files = ( 128 | glob(osp.join(dir, "*monitor.json")) + 129 | glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files 130 | if not monitor_files: 131 | raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir)) 132 | dfs = [] 133 | headers = [] 134 | for fname in monitor_files: 135 | with open(fname, 'rt') as fh: 136 | if fname.endswith('csv'): 137 | firstline = fh.readline() 138 | if not firstline: 139 | continue 140 | assert firstline[0] == '#' 141 | header = json.loads(firstline[1:]) 142 | df = pandas.read_csv(fh, index_col=None) 143 | headers.append(header) 144 | elif fname.endswith('json'): # Deprecated json format 145 | episodes = [] 146 | lines = fh.readlines() 147 | header = json.loads(lines[0]) 148 | headers.append(header) 149 | for line in lines[1:]: 150 | episode = json.loads(line) 151 | episodes.append(episode) 152 | df = pandas.DataFrame(episodes) 153 | else: 154 | assert 0, 'unreachable' 155 | df['t'] += header['t_start'] 156 | dfs.append(df) 157 | df = pandas.concat(dfs) 158 | df.sort_values('t', inplace=True) 159 | df.reset_index(inplace=True) 160 | df['t'] -= min(header['t_start'] for header in headers) 161 | df.headers = headers # HACK to preserve backwards compatibility 162 | return df 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning Algorithms 2 | ![logo](figures/logo.png) 3 |   4 | ![MIT License](https://img.shields.io/badge/license-MIT-blue.svg) 5 | This repository will implement the classic deep reinforcement learning algorithms by using **PyTorch**. The aim of this repository is to provide clear code for people to learn the deep reinforcemen learning algorithms. In the future, more algorithms will be added and the existing codes will also be maintained. 6 | ## Current Implementations 7 | - [x] Deep Q-Learning Network (DQN) 8 | - [x] Basic DQN 9 | - [x] Double Q network 10 | - [x] Dueling Network Archtiecure 11 | - [x] Deep Deterministic Policy Gradient (DDPG) 12 | - [x] Advantage Actor-Critic (A2C) 13 | - [x] Trust Region Policy Gradient (TRPO) 14 | - [x] Proximal Policy Optimization (PPO) 15 | - [ ] Actor Critic using Kronecker-Factored Trust Region (ACKTR) 16 | - [x] Soft Actor-Critic (SAC) 17 | ## Update Info 18 | :triangular_flag_on_post: **2018-10-17** - In this update, most of algorithms have been imporved and **add more experiments with plots** (except for DPPG). The **PPO** now supports **atari-games** and **mujoco-env**. The **TRPO** is much stable and can have better results! 19 |   20 | :triangular_flag_on_post: **2019-07-15** - In this update, the installation for the openai baseline is no longer needed. I have intergated useful functions in the **rl__utils** module. DDPG is also re-implemented and support more results. README file has been modified. 
The code structure has also been adjusted slightly. 21 |   22 | :triangular_flag_on_post: **2019-07-26** - In this update, the revised repository is made public. In order to keep the repository light, I **rebuilt** the repository and deleted the previous version, but a backup is kept on Google Drive. 23 |   24 | :triangular_flag_on_post: **2019-11-13** - Changed the code structure of the repo: all algorithms have been moved to the `rl_algorithms/` folder. Added the Soft Actor-Critic method; the experiment plots will be added soon. 25 | ## TODO List 26 | - [ ] add prioritized experience replay. 27 | - [x] remove the dependency on OpenAI Baselines' pre-processing functions. 28 | - [x] improve **DDPG** - a PyTorch Hindsight Experience Replay (HER) with DDPG is already implemented; you could check it [here](https://github.com/TianhongDai/hindsight-experience-replay). 29 | - [ ] upload pre-trained models to Google Drive (will update soon!). 30 | ## Requirements 31 | - pytorch=1.0.1 32 | - gym=0.12.5 33 | - mpi4py 34 | - mujoco-py 35 | - opencv-python 36 | - cloudpickle 37 | ## Installation 38 | 1. Install our `rl_utils` module: 39 | ```bash 40 | pip install -e . 41 | ``` 42 | 2. Install MuJoCo: please follow the instructions on the [official website](https://github.com/openai/mujoco-py). 43 | 3. Install Atari and Box2D: 44 | ```bash 45 | sudo apt-get install swig # or: brew install swig (macOS) 46 | pip install gym[atari] 47 | pip install gym[box2d] 48 | pip install box2d box2d-kengz 49 | ``` 50 | ## Instructions 51 | 1. Train the agent (details can be found in each folder): 52 | ``` 53 | cd rl_algorithms// 54 | python train.py -- 55 | ``` 56 | 2. Play the demo: 57 | ``` 58 | cd rl_algorithms// 59 | python demo.py -- 60 | ``` 61 | ## Code Structures 62 | 1. **rl_algorithms**: 63 | - `arguments.py`: contains the parameters used in training. 64 | - `_agent.py`: contains the core of each reinforcement learning algorithm. 65 | - `models.py`: the network structure for the policy and value function. 66 | - `utils.py`: some useful functions, such as **select actions**. 67 | - `train.py`: the script to train the agent. 68 | - `demo.py`: visualizes the trained agent. 69 | 2. **rl_utils** module: 70 | - `env_wrapper/`: contains the pre-processing functions for the Atari games and wrappers to create environments. 71 | - `experience_replay/`: contains the experience replay for the off-policy RL algorithms. 72 | - `logger/`: contains functions to record log information during training. 73 | - `mpi_utils/`: contains the tools for MPI training. 74 | - `running_filter/`: contains the running mean filter used to normalize observations in the MuJoCo environments. 75 | - `seeds/`: contains the function to set up random seeds for reproducible training. 76 | ## Example Results 77 | ### 1. DQN algorithms 78 | ![dqn_performance](figures/01_dqn.png) 79 | ### 2. DDPG 80 | ![ddpg](figures/02_ddpg.png) 81 | ### 3. A2C 82 | ![a2c](figures/03_a2c.png) 83 | ### 4. TRPO 84 | ![trpo](figures/04_trpo.png) 85 | ### 5. PPO 86 | ![ppo](figures/05_ppo.png) 87 | ### 6.
SAC 88 | ![sac](figures/06_sac.png) 89 | 90 | ## Demos 91 | Atari Env (BreakoutNoFrameskip-v4)| Box2d Env (BipedalWalker-v2)| Mujoco Env (Hopper-v2) 92 | -----------------------|-----------------------|-----------------------| 93 | ![](figures/breakout.gif)| ![](figures/bipedal.gif)| ![](figures/hopper.gif) 94 | ## Acknowledgement 95 | - [Ilya Kostrikov's GitHub](https://github.com/ikostrikov) 96 | - [Openai Baselines](https://github.com/openai/baselines) 97 | - [Kai's suggestions to simplify MPI functions](https://github.com/Kaixhin) 98 | - [rlkit](https://github.com/vitchyr/rlkit) 99 | 100 | ## Related Papers 101 | [1] [A Brief Survey of Deep Reinforcement Learning](https://arxiv.org/abs/1708.05866) 102 | [2] [The Beta Policy for Continuous Control Reinforcement Learning](https://www.ri.cmu.edu/wp-content/uploads/2017/06/thesis-Chou.pdf) 103 | [3] [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) 104 | [4] [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461) 105 | [5] [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581) 106 | [6] [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) 107 | [7] [Continuous Deep Q-Learning with Model-based Acceleration](https://arxiv.org/abs/1603.00748) 108 | [8] [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/abs/1602.01783) 109 | [9] [Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477) 110 | [10] [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) 111 | [11] [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905) 112 | [12] [Scalable trust-region method for deep reinforcement learning using Kronecker-factored approximation](https://arxiv.org/abs/1708.05144) 113 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/a2c_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from models import net 4 | from datetime import datetime 5 | from utils import select_actions, evaluate_actions, discount_with_dones 6 | import os 7 | 8 | class a2c_agent: 9 | def __init__(self, envs, args): 10 | self.envs = envs 11 | self.args = args 12 | # define the network 13 | self.net = net(self.envs.action_space.n) 14 | if self.args.cuda: 15 | self.net.cuda() 16 | # define the optimizer 17 | self.optimizer = torch.optim.RMSprop(self.net.parameters(), lr=self.args.lr, eps=self.args.eps, alpha=self.args.alpha) 18 | if not os.path.exists(self.args.save_dir): 19 | os.mkdir(self.args.save_dir) 20 | # check the saved path for envs.. 21 | self.model_path = self.args.save_dir + self.args.env_name + '/' 22 | if not os.path.exists(self.model_path): 23 | os.mkdir(self.model_path) 24 | # get the obs.. 25 | self.batch_ob_shape = (self.args.num_workers * self.args.nsteps,) + self.envs.observation_space.shape 26 | self.obs = np.zeros((self.args.num_workers,) + self.envs.observation_space.shape, dtype=self.envs.observation_space.dtype.name) 27 | self.obs[:] = self.envs.reset() 28 | self.dones = [False for _ in range(self.args.num_workers)] 29 | 30 | # train the network.. 
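# learn() below alternates short rollouts and updates: each update collects
# args.nsteps transitions from each of args.num_workers parallel envs, so one
# update consumes num_workers * nsteps frames, which is exactly how
# num_updates is derived from args.total_frames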
31 | def learn(self): 32 | num_updates = self.args.total_frames // (self.args.num_workers * self.args.nsteps) 33 | # get the reward to calculate other information 34 | episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32) 35 | final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32) 36 | # start to update 37 | for update in range(num_updates): 38 | mb_obs, mb_rewards, mb_actions, mb_dones = [],[],[],[] 39 | for step in range(self.args.nsteps): 40 | with torch.no_grad(): 41 | input_tensor = self._get_tensors(self.obs) 42 | _, pi = self.net(input_tensor) 43 | # select actions 44 | actions = select_actions(pi) 45 | cpu_actions = actions.squeeze(1).cpu().numpy() 46 | # start to store the information 47 | mb_obs.append(np.copy(self.obs)) 48 | mb_actions.append(cpu_actions) 49 | mb_dones.append(self.dones) 50 | # step 51 | obs, rewards, dones, _ = self.envs.step(cpu_actions) 52 | # start to store the rewards 53 | self.dones = dones 54 | mb_rewards.append(rewards) 55 | for n, done in enumerate(dones): 56 | if done: 57 | self.obs[n] = self.obs[n]*0 58 | self.obs = obs 59 | episode_rewards += rewards 60 | # get the masks 61 | masks = np.array([0.0 if done else 1.0 for done in dones], dtype=np.float32) 62 | final_rewards *= masks 63 | final_rewards += (1 - masks) * episode_rewards 64 | episode_rewards *= masks 65 | # update the obs 66 | mb_dones.append(self.dones) 67 | # process the rollouts 68 | mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape) 69 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) 70 | mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) 71 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) 72 | mb_masks = mb_dones[:, :-1] 73 | mb_dones = mb_dones[:, 1:] 74 | # calculate the last value 75 | with torch.no_grad(): 76 | input_tensor = self._get_tensors(self.obs) 77 | last_values, _ = self.net(input_tensor) 78 | # compute returns 79 | for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values.detach().cpu().numpy().squeeze())): 80 | rewards = rewards.tolist() 81 | dones = dones.tolist() 82 | if dones[-1] == 0: 83 | rewards = discount_with_dones(rewards+[value], dones+[0], self.args.gamma)[:-1] 84 | else: 85 | rewards = discount_with_dones(rewards, dones, self.args.gamma) 86 | mb_rewards[n] = rewards 87 | mb_rewards = mb_rewards.flatten() 88 | mb_actions = mb_actions.flatten() 89 | # start to update network 90 | vl, al, ent = self._update_network(mb_obs, mb_rewards, mb_actions) 91 | if update % self.args.log_interval == 0: 92 | print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.1f}, VL: {:.3f}, PL: {:.3f}, Ent: {:.2f}, Min: {}, Max:{}'.format(\ 93 | datetime.now(), update, num_updates, (update+1)*(self.args.num_workers * self.args.nsteps),\ 94 | final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max())) 95 | torch.save(self.net.state_dict(), self.model_path + 'model.pt') 96 | 97 | # update_network 98 | def _update_network(self, obs, returns, actions): 99 | # evaluate the actions 100 | input_tensor = self._get_tensors(obs) 101 | values, pi = self.net(input_tensor) 102 | # define the tensor of actions, returns 103 | returns = torch.tensor(returns, dtype=torch.float32).unsqueeze(1) 104 | actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(1) 105 | if self.args.cuda: 106 | returns = returns.cuda() 107 | actions = actions.cuda() 108 | # evaluate actions 109 | action_log_probs, dist_entropy = evaluate_actions(pi, actions) 110 | 
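# the update below is the standard A2C objective: with advantage A = R - V(s),
# the actor minimizes -A * log pi(a|s) (A is detached so the policy gradient
# does not flow through the critic), the critic minimizes A^2, and an entropy
# bonus weighted by entropy_coef encourages exploration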
# calculate advantages... 111 | advantages = returns - values 112 | # get the value loss 113 | value_loss = advantages.pow(2).mean() 114 | # get the action loss 115 | action_loss = -(advantages.detach() * action_log_probs).mean() 116 | # total loss 117 | total_loss = action_loss + self.args.value_loss_coef * value_loss - self.args.entropy_coef * dist_entropy 118 | # start to update 119 | self.optimizer.zero_grad() 120 | total_loss.backward() 121 | torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.args.max_grad_norm) 122 | self.optimizer.step() 123 | return value_loss.item(), action_loss.item(), dist_entropy.item() 124 | 125 | # get the tensors... 126 | def _get_tensors(self, obs): 127 | input_tensor = torch.tensor(np.transpose(obs, (0, 3, 1, 2)), dtype=torch.float32) 128 | if self.args.cuda: 129 | input_tensor = input_tensor.cuda() 130 | return input_tensor 131 | -------------------------------------------------------------------------------- /rl_utils/env_wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | import contextlib 4 | 5 | class AlreadySteppingError(Exception): 6 | """ 7 | Raised when an asynchronous step is running while 8 | step_async() is called again. 9 | """ 10 | 11 | def __init__(self): 12 | msg = 'already running an async step' 13 | Exception.__init__(self, msg) 14 | 15 | 16 | class NotSteppingError(Exception): 17 | """ 18 | Raised when an asynchronous step is not running but 19 | step_wait() is called. 20 | """ 21 | 22 | def __init__(self): 23 | msg = 'not running an async step' 24 | Exception.__init__(self, msg) 25 | 26 | 27 | class VecEnv(ABC): 28 | """ 29 | An abstract asynchronous, vectorized environment. 30 | Used to batch data from multiple copies of an environment, so that 31 | each observation becomes an batch of observations, and expected action is a batch of actions to 32 | be applied per-environment. 33 | """ 34 | closed = False 35 | viewer = None 36 | 37 | metadata = { 38 | 'render.modes': ['human', 'rgb_array'] 39 | } 40 | 41 | def __init__(self, num_envs, observation_space, action_space): 42 | self.num_envs = num_envs 43 | self.observation_space = observation_space 44 | self.action_space = action_space 45 | 46 | @abstractmethod 47 | def reset(self): 48 | """ 49 | Reset all the environments and return an array of 50 | observations, or a dict of observation arrays. 51 | 52 | If step_async is still doing work, that work will 53 | be cancelled and step_wait() should not be called 54 | until step_async() is invoked again. 55 | """ 56 | pass 57 | 58 | @abstractmethod 59 | def step_async(self, actions): 60 | """ 61 | Tell all the environments to start taking a step 62 | with the given actions. 63 | Call step_wait() to get the results of the step. 64 | 65 | You should not call this if a step_async run is 66 | already pending. 67 | """ 68 | pass 69 | 70 | @abstractmethod 71 | def step_wait(self): 72 | """ 73 | Wait for the step taken with step_async(). 74 | 75 | Returns (obs, rews, dones, infos): 76 | - obs: an array of observations, or a dict of 77 | arrays of observations. 78 | - rews: an array of rewards 79 | - dones: an array of "episode done" booleans 80 | - infos: a sequence of info objects 81 | """ 82 | pass 83 | 84 | def close_extras(self): 85 | """ 86 | Clean up the extra resources, beyond what's in this base class. 87 | Only runs when not self.closed. 
88 | """ 89 | pass 90 | 91 | def close(self): 92 | if self.closed: 93 | return 94 | if self.viewer is not None: 95 | self.viewer.close() 96 | self.close_extras() 97 | self.closed = True 98 | 99 | def step(self, actions): 100 | """ 101 | Step the environments synchronously. 102 | 103 | This is available for backwards compatibility. 104 | """ 105 | self.step_async(actions) 106 | return self.step_wait() 107 | 108 | def render(self, mode='human'): 109 | raise NotImplementedError 110 | 111 | def get_images(self): 112 | """ 113 | Return RGB images from each environment 114 | """ 115 | raise NotImplementedError 116 | 117 | @property 118 | def unwrapped(self): 119 | if isinstance(self, VecEnvWrapper): 120 | return self.venv.unwrapped 121 | else: 122 | return self 123 | 124 | def get_viewer(self): 125 | if self.viewer is None: 126 | from gym.envs.classic_control import rendering 127 | self.viewer = rendering.SimpleImageViewer() 128 | return self.viewer 129 | 130 | class VecEnvWrapper(VecEnv): 131 | """ 132 | An environment wrapper that applies to an entire batch 133 | of environments at once. 134 | """ 135 | 136 | def __init__(self, venv, observation_space=None, action_space=None): 137 | self.venv = venv 138 | super().__init__(num_envs=venv.num_envs, 139 | observation_space=observation_space or venv.observation_space, 140 | action_space=action_space or venv.action_space) 141 | 142 | def step_async(self, actions): 143 | self.venv.step_async(actions) 144 | 145 | @abstractmethod 146 | def reset(self): 147 | pass 148 | 149 | @abstractmethod 150 | def step_wait(self): 151 | pass 152 | 153 | def close(self): 154 | return self.venv.close() 155 | 156 | def render(self, mode='human'): 157 | return self.venv.render(mode=mode) 158 | 159 | def get_images(self): 160 | return self.venv.get_images() 161 | 162 | def __getattr__(self, name): 163 | if name.startswith('_'): 164 | raise AttributeError("attempted to get missing private attribute '{}'".format(name)) 165 | return getattr(self.venv, name) 166 | 167 | class VecEnvObservationWrapper(VecEnvWrapper): 168 | @abstractmethod 169 | def process(self, obs): 170 | pass 171 | 172 | def reset(self): 173 | obs = self.venv.reset() 174 | return self.process(obs) 175 | 176 | def step_wait(self): 177 | obs, rews, dones, infos = self.venv.step_wait() 178 | return self.process(obs), rews, dones, infos 179 | 180 | class CloudpickleWrapper(object): 181 | """ 182 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 183 | """ 184 | 185 | def __init__(self, x): 186 | self.x = x 187 | 188 | def __getstate__(self): 189 | import cloudpickle 190 | return cloudpickle.dumps(self.x) 191 | 192 | def __setstate__(self, ob): 193 | import pickle 194 | self.x = pickle.loads(ob) 195 | 196 | @contextlib.contextmanager 197 | def clear_mpi_env_vars(): 198 | """ 199 | from mpi4py import MPI will call MPI_Init by default. If the child process has MPI environment variables, MPI will think that the child process is an MPI process just like the parent and do bad things such as hang. 200 | This context manager is a hacky way to clear those environment variables temporarily such as when we are starting multiprocessing 201 | Processes. 
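    A minimal usage sketch (this is how SubprocVecEnv in multi_envs_wrapper.py
    uses it): wrap only the child-process start call,

        with clear_mpi_env_vars():
            proc.start()

    so the OMPI_*/PMI_* variables are hidden just while the workers are spawned.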
202 | """ 203 | removed_environment = {} 204 | for k, v in list(os.environ.items()): 205 | for prefix in ['OMPI_', 'PMI_']: 206 | if k.startswith(prefix): 207 | removed_environment[k] = v 208 | del os.environ[k] 209 | try: 210 | yield 211 | finally: 212 | os.environ.update(removed_environment) 213 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/ddpg_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from models import actor, critic 3 | import torch 4 | import os 5 | from datetime import datetime 6 | from mpi4py import MPI 7 | from rl_utils.mpi_utils.normalizer import normalizer 8 | from rl_utils.mpi_utils.utils import sync_networks, sync_grads 9 | from rl_utils.experience_replay.experience_replay import replay_buffer 10 | from utils import ounoise 11 | import copy 12 | import gym 13 | 14 | """ 15 | ddpg algorithms - revised baseline version 16 | 17 | support MPI training 18 | 19 | """ 20 | 21 | class ddpg_agent: 22 | def __init__(self, env, args): 23 | self.env = env 24 | self.args = args 25 | # get the dims and action max of the environment 26 | obs_dims = self.env.observation_space.shape[0] 27 | self.action_dims = self.env.action_space.shape[0] 28 | self.action_max = self.env.action_space.high[0] 29 | # define the network 30 | self.actor_net = actor(obs_dims, self.action_dims) 31 | self.critic_net = critic(obs_dims, self.action_dims) 32 | # sync the weights across the mpi 33 | sync_networks(self.actor_net) 34 | sync_networks(self.critic_net) 35 | # build the target newtork 36 | self.actor_target_net = copy.deepcopy(self.actor_net) 37 | self.critic_target_net = copy.deepcopy(self.critic_net) 38 | # create the optimizer 39 | self.actor_optim = torch.optim.Adam(self.actor_net.parameters(), self.args.lr_actor) 40 | self.critic_optim = torch.optim.Adam(self.critic_net.parameters(), self.args.lr_critic, weight_decay=self.args.critic_l2_reg) 41 | # create the replay buffer 42 | self.replay_buffer = replay_buffer(self.args.replay_size) 43 | # create the normalizer 44 | self.o_norm = normalizer(obs_dims, default_clip_range=self.args.clip_range) 45 | # create the noise generator 46 | self.noise_generator = ounoise(std=0.2, action_dim=self.action_dims) 47 | # create the dir to save models 48 | if MPI.COMM_WORLD.Get_rank() == 0: 49 | if not os.path.exists(self.args.save_dir): 50 | os.mkdir(self.args.save_dir) 51 | self.model_path = os.path.join(self.args.save_dir, self.args.env_name) 52 | if not os.path.exists(self.model_path): 53 | os.mkdir(self.model_path) 54 | # create a eval environemnt 55 | self.eval_env = gym.make(self.args.env_name) 56 | # set seeds 57 | self.eval_env.seed(self.args.seed * 2 + MPI.COMM_WORLD.Get_rank()) 58 | 59 | def learn(self): 60 | """ 61 | the learning part 62 | 63 | """ 64 | self.actor_net.train() 65 | # reset the environmenr firstly 66 | obs = self.env.reset() 67 | self.noise_generator.reset() 68 | # get the number of epochs 69 | nb_epochs = self.args.total_frames // (self.args.nb_rollout_steps * self.args.nb_cycles) 70 | for epoch in range(nb_epochs): 71 | for _ in range(self.args.nb_cycles): 72 | # used to update the normalizer 73 | ep_obs = [] 74 | for _ in range(self.args.nb_rollout_steps): 75 | with torch.no_grad(): 76 | inputs_tensor = self._preproc_inputs(obs) 77 | pi = self.actor_net(inputs_tensor) 78 | action = self._select_actions(pi) 79 | # feed actions into the environment 80 | obs_, reward, done, _ = self.env.step(self.action_max * 
action) 81 | # append the rollout information into the memory 82 | self.replay_buffer.add(obs, action, reward, obs_, float(done)) 83 | ep_obs.append(obs.copy()) 84 | obs = obs_ 85 | # if done, reset the environment 86 | if done: 87 | obs = self.env.reset() 88 | self.noise_generator.reset() 89 | # then start to do the update of the normalizer 90 | ep_obs = np.array(ep_obs) 91 | self.o_norm.update(ep_obs) 92 | self.o_norm.recompute_stats() 93 | # then start to update the network 94 | for _ in range(self.args.nb_train): 95 | a_loss, c_loss = self._update_network() 96 | # update the target network 97 | self._soft_update_target_network(self.actor_target_net, self.actor_net) 98 | self._soft_update_target_network(self.critic_target_net, self.critic_net) 99 | # start to do the evaluation 100 | success_rate = self._eval_agent() 101 | # convert back to normal 102 | self.actor_net.train() 103 | if epoch % self.args.display_interval == 0: 104 | if MPI.COMM_WORLD.Get_rank() == 0: 105 | print('[{}] Epoch: {} / {}, Frames: {}, Rewards: {:.3f}, Actor loss: {:.3f}, Critic Loss: {:.3f}'.format(datetime.now(), \ 106 | epoch, nb_epochs, (epoch+1) * self.args.nb_rollout_steps * self.args.nb_cycles, success_rate, a_loss, c_loss)) 107 | torch.save([self.actor_net.state_dict(), self.o_norm.mean, self.o_norm.std], self.model_path + '/model.pt') 108 | 109 | # functions to preprocess the image 110 | def _preproc_inputs(self, obs): 111 | obs_norm = self.o_norm.normalize(obs) 112 | inputs_tensor = torch.tensor(obs_norm, dtype=torch.float32).unsqueeze(0) 113 | return inputs_tensor 114 | 115 | # this function will choose action for the agent and do the exploration 116 | def _select_actions(self, pi): 117 | action = pi.cpu().numpy().squeeze() 118 | # TODO: Noise type now - only support ounoise 119 | # add the gaussian noise 120 | #action = action + np.random.normal(0, 0.1, self.action_dims) 121 | # add ou noise 122 | action = action + self.noise_generator.noise() 123 | action = np.clip(action, -1, 1) 124 | return action 125 | 126 | # update the network 127 | def _update_network(self): 128 | # sample the samples from the replay buffer 129 | samples = self.replay_buffer.sample(self.args.batch_size) 130 | obses, actions, rewards, obses_next, dones = samples 131 | # try to do the normalization of obses 132 | norm_obses = self.o_norm.normalize(obses) 133 | norm_obses_next = self.o_norm.normalize(obses_next) 134 | # transfer them into tensors 135 | norm_obses_tensor = torch.tensor(norm_obses, dtype=torch.float32) 136 | norm_obses_next_tensor = torch.tensor(norm_obses_next, dtype=torch.float32) 137 | actions_tensor = torch.tensor(actions, dtype=torch.float32) 138 | rewards_tensor = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1) 139 | dones_tensor = torch.tensor(dones, dtype=torch.float32).unsqueeze(1) 140 | with torch.no_grad(): 141 | actions_next = self.actor_target_net(norm_obses_next_tensor) 142 | q_next_value = self.critic_target_net(norm_obses_next_tensor, actions_next) 143 | target_q_value = rewards_tensor + (1 - dones_tensor) * self.args.gamma * q_next_value 144 | # the real q value 145 | real_q_value = self.critic_net(norm_obses_tensor, actions_tensor) 146 | critic_loss = (real_q_value - target_q_value).pow(2).mean() 147 | # the actor loss 148 | actions_real = self.actor_net(norm_obses_tensor) 149 | actor_loss = -self.critic_net(norm_obses_tensor, actions_real).mean() 150 | # start to update the network 151 | self.actor_optim.zero_grad() 152 | actor_loss.backward() 153 | sync_grads(self.actor_net) 154 | 
self.actor_optim.step() 155 | # update the critic network 156 | self.critic_optim.zero_grad() 157 | critic_loss.backward() 158 | sync_grads(self.critic_net) 159 | self.critic_optim.step() 160 | return actor_loss.item(), critic_loss.item() 161 | 162 | # soft update the target network... 163 | def _soft_update_target_network(self, target, source): 164 | for target_param, param in zip(target.parameters(), source.parameters()): 165 | target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data) 166 | 167 | # do the evaluation 168 | def _eval_agent(self): 169 | self.actor_net.eval() 170 | total_success_rate = [] 171 | for _ in range(self.args.nb_test_rollouts): 172 | per_success_rate = [] 173 | obs = self.eval_env.reset() 174 | while True: 175 | with torch.no_grad(): 176 | inputs_tensor = self._preproc_inputs(obs) 177 | pi = self.actor_net(inputs_tensor) 178 | actions = pi.detach().cpu().numpy().squeeze() 179 | if self.action_dims == 1: 180 | actions = np.array([actions]) 181 | obs_, reward, done, _ = self.eval_env.step(actions * self.action_max) 182 | per_success_rate.append(reward) 183 | obs = obs_ 184 | if done: 185 | break 186 | total_success_rate.append(np.sum(per_success_rate)) 187 | local_success_rate = np.mean(total_success_rate) 188 | global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate, op=MPI.SUM) 189 | return global_success_rate / MPI.COMM_WORLD.Get_size() 190 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/trpo_agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | from models import network 5 | from rl_utils.running_filter.running_filter import ZFilter 6 | from utils import select_actions, eval_actions, conjugated_gradient, line_search, set_flat_params_to 7 | from datetime import datetime 8 | 9 | class trpo_agent: 10 | def __init__(self, env, args): 11 | self.env = env 12 | self.args = args 13 | # define the network 14 | self.net = network(self.env.observation_space.shape[0], self.env.action_space.shape[0]) 15 | self.old_net = network(self.env.observation_space.shape[0], self.env.action_space.shape[0]) 16 | # make sure the net and old net have the same parameters 17 | self.old_net.load_state_dict(self.net.state_dict()) 18 | # define the optimizer 19 | self.optimizer = torch.optim.Adam(self.net.critic.parameters(), lr=self.args.lr) 20 | # define the running mean filter 21 | self.running_state = ZFilter((self.env.observation_space.shape[0],), clip=5) 22 | if not os.path.exists(self.args.save_dir): 23 | os.mkdir(self.args.save_dir) 24 | self.model_path = self.args.save_dir + self.args.env_name + '/' 25 | if not os.path.exists(self.model_path): 26 | os.mkdir(self.model_path) 27 | 28 | def learn(self): 29 | num_updates = self.args.total_timesteps // self.args.nsteps 30 | obs = self.running_state(self.env.reset()) 31 | final_reward = 0 32 | episode_reward = 0 33 | self.dones = False 34 | for update in range(num_updates): 35 | mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], [] 36 | for step in range(self.args.nsteps): 37 | with torch.no_grad(): 38 | obs_tensor = self._get_tensors(obs) 39 | value, pi = self.net(obs_tensor) 40 | # select actions 41 | actions = select_actions(pi) 42 | # store informations 43 | mb_obs.append(np.copy(obs)) 44 | mb_actions.append(actions) 45 | mb_dones.append(self.dones) 46 | mb_values.append(value.detach().numpy().squeeze()) 47 | # start to 
execute actions in the environment 48 | obs_, reward, done, _ = self.env.step(actions) 49 | self.dones = done 50 | mb_rewards.append(reward) 51 | if done: 52 | obs_ = self.env.reset() 53 | obs = self.running_state(obs_) 54 | episode_reward += reward 55 | mask = 0.0 if done else 1.0 56 | final_reward *= mask 57 | final_reward += (1 - mask) * episode_reward 58 | episode_reward *= mask 59 | # to process the rollouts 60 | mb_obs = np.asarray(mb_obs, dtype=np.float32) 61 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32) 62 | mb_actions = np.asarray(mb_actions, dtype=np.float32) 63 | mb_dones = np.asarray(mb_dones, dtype=np.bool) 64 | mb_values = np.asarray(mb_values, dtype=np.float32) 65 | # compute the last state value 66 | with torch.no_grad(): 67 | obs_tensor = self._get_tensors(obs) 68 | last_value, _ = self.net(obs_tensor) 69 | last_value = last_value.detach().numpy().squeeze() 70 | # compute the advantages 71 | mb_returns = np.zeros_like(mb_rewards) 72 | mb_advs = np.zeros_like(mb_rewards) 73 | lastgaelam = 0 74 | for t in reversed(range(self.args.nsteps)): 75 | if t == self.args.nsteps - 1: 76 | nextnonterminal = 1.0 - self.dones 77 | nextvalues = last_value 78 | else: 79 | nextnonterminal = 1.0 - mb_dones[t + 1] 80 | nextvalues = mb_values[t + 1] 81 | delta = mb_rewards[t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[t] 82 | mb_advs[t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam 83 | mb_returns = mb_advs + mb_values 84 | # normalize the advantages 85 | mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-5) 86 | # before the update, make the old network has the parameter of the current network 87 | self.old_net.load_state_dict(self.net.state_dict()) 88 | # start to update the network 89 | policy_loss, value_loss = self._update_network(mb_obs, mb_actions, mb_returns, mb_advs) 90 | torch.save([self.net.state_dict(), self.running_state], self.model_path + 'model.pt') 91 | print('[{}] Update: {} / {}, Frames: {}, Reward: {:.3f}, VL: {:.3f}, PL: {:.3f}'.format(datetime.now(), update, \ 92 | num_updates, (update + 1)*self.args.nsteps, final_reward, value_loss, policy_loss)) 93 | 94 | # start to update network 95 | def _update_network(self, mb_obs, mb_actions, mb_returns, mb_advs): 96 | mb_obs_tensor = torch.tensor(mb_obs, dtype=torch.float32) 97 | mb_actions_tensor = torch.tensor(mb_actions, dtype=torch.float32) 98 | mb_returns_tensor = torch.tensor(mb_returns, dtype=torch.float32).unsqueeze(1) 99 | mb_advs_tensor = torch.tensor(mb_advs, dtype=torch.float32).unsqueeze(1) 100 | # try to get the old policy and current policy 101 | values, _ = self.net(mb_obs_tensor) 102 | with torch.no_grad(): 103 | _, pi_old = self.old_net(mb_obs_tensor) 104 | # get the surr loss 105 | surr_loss = self._get_surrogate_loss(mb_obs_tensor, mb_advs_tensor, mb_actions_tensor, pi_old) 106 | # comupte the surrogate gardient -> g, Ax = g, where A is the fisher information matrix 107 | surr_grad = torch.autograd.grad(surr_loss, self.net.actor.parameters()) 108 | flat_surr_grad = torch.cat([grad.view(-1) for grad in surr_grad]).data 109 | # use the conjugated gradient to calculate the scaled direction vector (natural gradient) 110 | nature_grad = conjugated_gradient(self._fisher_vector_product, -flat_surr_grad, 10, mb_obs_tensor, pi_old) 111 | # calculate the scaleing ratio 112 | non_scale_kl = 0.5 * (nature_grad * self._fisher_vector_product(nature_grad, mb_obs_tensor, pi_old)).sum(0, keepdim=True) 113 | scale_ratio = torch.sqrt(non_scale_kl / 
self.args.max_kl) 114 | final_nature_grad = nature_grad / scale_ratio[0] 115 | # calculate the expected improvement rate... 116 | expected_improve = (-flat_surr_grad * nature_grad).sum(0, keepdim=True) / scale_ratio[0] 117 | # get the flat param ... 118 | prev_params = torch.cat([param.data.view(-1) for param in self.net.actor.parameters()]) 119 | # start to do the line search 120 | success, new_params = line_search(self.net.actor, self._get_surrogate_loss, prev_params, final_nature_grad, \ 121 | expected_improve, mb_obs_tensor, mb_advs_tensor, mb_actions_tensor, pi_old) 122 | set_flat_params_to(self.net.actor, new_params) 123 | # then trying to update the critic network 124 | inds = np.arange(mb_obs.shape[0]) 125 | for _ in range(self.args.vf_itrs): 126 | np.random.shuffle(inds) 127 | for start in range(0, mb_obs.shape[0], self.args.batch_size): 128 | end = start + self.args.batch_size 129 | mbinds = inds[start:end] 130 | mini_obs = mb_obs[mbinds] 131 | mini_returns = mb_returns[mbinds] 132 | # put things in the tensor 133 | mini_obs = torch.tensor(mini_obs, dtype=torch.float32) 134 | mini_returns = torch.tensor(mini_returns, dtype=torch.float32).unsqueeze(1) 135 | values, _ = self.net(mini_obs) 136 | v_loss = (mini_returns - values).pow(2).mean() 137 | self.optimizer.zero_grad() 138 | v_loss.backward() 139 | self.optimizer.step() 140 | return surr_loss.item(), v_loss.item() 141 | 142 | # get the surrogate loss 143 | def _get_surrogate_loss(self, obs, adv, actions, pi_old): 144 | _, pi = self.net(obs) 145 | log_prob = eval_actions(pi, actions) 146 | old_log_prob = eval_actions(pi_old, actions).detach() 147 | surr_loss = -torch.exp(log_prob - old_log_prob) * adv 148 | return surr_loss.mean() 149 | 150 | # the product of the fisher informaiton matrix and the nature gradient -> Ax 151 | def _fisher_vector_product(self, v, obs, pi_old): 152 | kl = self._get_kl(obs, pi_old) 153 | kl = kl.mean() 154 | # start to calculate the second order gradient of the KL 155 | kl_grads = torch.autograd.grad(kl, self.net.actor.parameters(), create_graph=True) 156 | flat_kl_grads = torch.cat([grad.view(-1) for grad in kl_grads]) 157 | kl_v = (flat_kl_grads * torch.autograd.Variable(v)).sum() 158 | kl_second_grads = torch.autograd.grad(kl_v, self.net.actor.parameters()) 159 | flat_kl_second_grads = torch.cat([grad.contiguous().view(-1) for grad in kl_second_grads]).data 160 | flat_kl_second_grads = flat_kl_second_grads + self.args.damping * v 161 | return flat_kl_second_grads 162 | 163 | # get the kl divergence between two distributions 164 | def _get_kl(self, obs, pi_old): 165 | mean_old, std_old = pi_old 166 | _, pi = self.net(obs) 167 | mean, std = pi 168 | # start to calculate the kl-divergence 169 | kl = -torch.log(std / std_old) + (std.pow(2) + (mean - mean_old).pow(2)) / (2 * std_old.pow(2)) - 0.5 170 | return kl.sum(1, keepdim=True) 171 | 172 | # get the tensors 173 | def _get_tensors(self, obs): 174 | return torch.tensor(obs, dtype=torch.float32).unsqueeze(0) 175 | -------------------------------------------------------------------------------- /rl_algorithms/sac/sac_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from models import flatten_mlp, tanh_gaussian_actor 4 | from rl_utils.experience_replay.experience_replay import replay_buffer 5 | from utils import get_action_info 6 | from datetime import datetime 7 | import copy 8 | import os 9 | import gym 10 | 11 | 12 | """ 13 | 2019-Nov-12 - start to add the 
automatically tempature tuning 14 | 15 | 2019-JUN-05 16 | 17 | author: Tianhong Dai 18 | 19 | """ 20 | 21 | # the soft-actor-critic agent 22 | class sac_agent: 23 | def __init__(self, env, args): 24 | self.args = args 25 | self.env = env 26 | # create eval environment 27 | self.eval_env = gym.make(self.args.env_name) 28 | self.eval_env.seed(args.seed * 2) 29 | # build up the network that will be used. 30 | self.qf1 = flatten_mlp(self.env.observation_space.shape[0], self.args.hidden_size, self.env.action_space.shape[0]) 31 | self.qf2 = flatten_mlp(self.env.observation_space.shape[0], self.args.hidden_size, self.env.action_space.shape[0]) 32 | # set the target q functions 33 | self.target_qf1 = copy.deepcopy(self.qf1) 34 | self.target_qf2 = copy.deepcopy(self.qf2) 35 | # build up the policy network 36 | self.actor_net = tanh_gaussian_actor(self.env.observation_space.shape[0], self.env.action_space.shape[0], self.args.hidden_size, \ 37 | self.args.log_std_min, self.args.log_std_max) 38 | # define the optimizer for them 39 | self.qf1_optim = torch.optim.Adam(self.qf1.parameters(), lr=self.args.q_lr) 40 | self.qf2_optim = torch.optim.Adam(self.qf2.parameters(), lr=self.args.q_lr) 41 | # the optimizer for the policy network 42 | self.actor_optim = torch.optim.Adam(self.actor_net.parameters(), lr=self.args.p_lr) 43 | # entorpy target 44 | self.target_entropy = -np.prod(self.env.action_space.shape).item() 45 | self.log_alpha = torch.zeros(1, requires_grad=True, device='cuda' if self.args.cuda else 'cpu') 46 | # define the optimizer 47 | self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.args.p_lr) 48 | # define the replay buffer 49 | self.buffer = replay_buffer(self.args.buffer_size) 50 | # get the action max 51 | self.action_max = self.env.action_space.high[0] 52 | # if use cuda, put tensor onto the gpu 53 | if self.args.cuda: 54 | self.actor_net.cuda() 55 | self.qf1.cuda() 56 | self.qf2.cuda() 57 | self.target_qf1.cuda() 58 | self.target_qf2.cuda() 59 | # automatically create the folders to save models 60 | if not os.path.exists(self.args.save_dir): 61 | os.mkdir(self.args.save_dir) 62 | self.model_path = os.path.join(self.args.save_dir, self.args.env_name) 63 | if not os.path.exists(self.model_path): 64 | os.mkdir(self.model_path) 65 | 66 | # train the agent 67 | def learn(self): 68 | global_timesteps = 0 69 | # before the official training, do the initial exploration to add episodes into the replay buffer 70 | self._initial_exploration(exploration_policy=self.args.init_exploration_policy) 71 | # reset the environment 72 | obs = self.env.reset() 73 | for epoch in range(self.args.n_epochs): 74 | for _ in range(self.args.train_loop_per_epoch): 75 | # for each epoch, it will reset the environment 76 | for t in range(self.args.epoch_length): 77 | # start to collect samples 78 | with torch.no_grad(): 79 | obs_tensor = self._get_tensor_inputs(obs) 80 | pi = self.actor_net(obs_tensor) 81 | action = get_action_info(pi, cuda=self.args.cuda).select_actions(reparameterize=False) 82 | action = action.cpu().numpy()[0] 83 | # input the actions into the environment 84 | obs_, reward, done, _ = self.env.step(self.action_max * action) 85 | # store the samples 86 | self.buffer.add(obs, action, reward, obs_, float(done)) 87 | # reassign the observations 88 | obs = obs_ 89 | if done: 90 | # reset the environment 91 | obs = self.env.reset() 92 | # after collect the samples, start to update the network 93 | for _ in range(self.args.update_cycles): 94 | qf1_loss, qf2_loss, actor_loss, alpha, alpha_loss = 
self._update_newtork() 95 | # update the target network 96 | if global_timesteps % self.args.target_update_interval == 0: 97 | self._update_target_network(self.target_qf1, self.qf1) 98 | self._update_target_network(self.target_qf2, self.qf2) 99 | global_timesteps += 1 100 | # print the log information 101 | if epoch % self.args.display_interval == 0: 102 | # start to do the evaluation 103 | mean_rewards = self._evaluate_agent() 104 | print('[{}] Epoch: {} / {}, Frames: {}, Rewards: {:.3f}, QF1: {:.3f}, QF2: {:.3f}, AL: {:.3f}, Alpha: {:.5f}, AlphaL: {:.5f}'.format(datetime.now(), \ 105 | epoch, self.args.n_epochs, (epoch + 1) * self.args.epoch_length, mean_rewards, qf1_loss, qf2_loss, actor_loss, alpha, alpha_loss)) 106 | # save models 107 | torch.save(self.actor_net.state_dict(), self.model_path + '/model.pt') 108 | 109 | # do the initial exploration by using the uniform policy 110 | def _initial_exploration(self, exploration_policy='gaussian'): 111 | # get the action information of the environment 112 | obs = self.env.reset() 113 | for _ in range(self.args.init_exploration_steps): 114 | if exploration_policy == 'uniform': 115 | raise NotImplementedError 116 | elif exploration_policy == 'gaussian': 117 | # the sac does not need normalize? 118 | with torch.no_grad(): 119 | obs_tensor = self._get_tensor_inputs(obs) 120 | # generate the policy 121 | pi = self.actor_net(obs_tensor) 122 | action = get_action_info(pi).select_actions(reparameterize=False) 123 | action = action.cpu().numpy()[0] 124 | # input the action input the environment 125 | obs_, reward, done, _ = self.env.step(self.action_max * action) 126 | # store the episodes 127 | self.buffer.add(obs, action, reward, obs_, float(done)) 128 | obs = obs_ 129 | if done: 130 | # if done, reset the environment 131 | obs = self.env.reset() 132 | print("Initial exploration has been finished!") 133 | # get tensors 134 | def _get_tensor_inputs(self, obs): 135 | obs_tensor = torch.tensor(obs, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu').unsqueeze(0) 136 | return obs_tensor 137 | 138 | # update the network 139 | def _update_newtork(self): 140 | # smaple batch of samples from the replay buffer 141 | obses, actions, rewards, obses_, dones = self.buffer.sample(self.args.batch_size) 142 | # preprocessing the data into the tensors, will support GPU later 143 | obses = torch.tensor(obses, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu') 144 | actions = torch.tensor(actions, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu') 145 | rewards = torch.tensor(rewards, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu').unsqueeze(-1) 146 | obses_ = torch.tensor(obses_, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu') 147 | inverse_dones = torch.tensor(1 - dones, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu').unsqueeze(-1) 148 | # start to update the actor network 149 | pis = self.actor_net(obses) 150 | actions_info = get_action_info(pis, cuda=self.args.cuda) 151 | actions_, pre_tanh_value = actions_info.select_actions(reparameterize=True) 152 | log_prob = actions_info.get_log_prob(actions_, pre_tanh_value) 153 | # use the automatically tuning 154 | alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean() 155 | self.alpha_optim.zero_grad() 156 | alpha_loss.backward() 157 | self.alpha_optim.step() 158 | # get the param 159 | alpha = self.log_alpha.exp() 160 | # get the q_value for new actions 161 | q_actions_ = torch.min(self.qf1(obses, 
actions_), self.qf2(obses, actions_)) 162 | actor_loss = (alpha * log_prob - q_actions_).mean() 163 | # q value function loss 164 | q1_value = self.qf1(obses, actions) 165 | q2_value = self.qf2(obses, actions) 166 | with torch.no_grad(): 167 | pis_next = self.actor_net(obses_) 168 | actions_info_next = get_action_info(pis_next, cuda=self.args.cuda) 169 | actions_next_, pre_tanh_value_next = actions_info_next.select_actions(reparameterize=True) 170 | log_prob_next = actions_info_next.get_log_prob(actions_next_, pre_tanh_value_next) 171 | target_q_value_next = torch.min(self.target_qf1(obses_, actions_next_), self.target_qf2(obses_, actions_next_)) - alpha * log_prob_next 172 | target_q_value = self.args.reward_scale * rewards + inverse_dones * self.args.gamma * target_q_value_next 173 | qf1_loss = (q1_value - target_q_value).pow(2).mean() 174 | qf2_loss = (q2_value - target_q_value).pow(2).mean() 175 | # qf1 176 | self.qf1_optim.zero_grad() 177 | qf1_loss.backward() 178 | self.qf1_optim.step() 179 | # qf2 180 | self.qf2_optim.zero_grad() 181 | qf2_loss.backward() 182 | self.qf2_optim.step() 183 | # policy loss 184 | self.actor_optim.zero_grad() 185 | actor_loss.backward() 186 | self.actor_optim.step() 187 | return qf1_loss.item(), qf2_loss.item(), actor_loss.item(), alpha.item(), alpha_loss.item() 188 | 189 | # update the target network 190 | def _update_target_network(self, target, source): 191 | for target_param, param in zip(target.parameters(), source.parameters()): 192 | target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data) 193 | 194 | # evaluate the agent 195 | def _evaluate_agent(self): 196 | total_reward = 0 197 | for _ in range(self.args.eval_episodes): 198 | obs = self.eval_env.reset() 199 | episode_reward = 0 200 | while True: 201 | with torch.no_grad(): 202 | obs_tensor = self._get_tensor_inputs(obs) 203 | pi = self.actor_net(obs_tensor) 204 | action = get_action_info(pi, cuda=self.args.cuda).select_actions(exploration=False, reparameterize=False) 205 | action = action.detach().cpu().numpy()[0] 206 | # input the action into the environment 207 | obs_, reward, done, _ = self.eval_env.step(self.action_max * action) 208 | episode_reward += reward 209 | if done: 210 | break 211 | obs = obs_ 212 | total_reward += episode_reward 213 | return total_reward / self.args.eval_episodes 214 | -------------------------------------------------------------------------------- /rl_utils/env_wrapper/atari_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | os.environ.setdefault('PATH', '') 4 | from collections import deque 5 | import gym 6 | from gym import spaces 7 | import cv2 8 | cv2.ocl.setUseOpenCL(False) 9 | 10 | """ 11 | the wrapper is taken from the openai baselines 12 | 13 | """ 14 | 15 | class NoopResetEnv(gym.Wrapper): 16 | def __init__(self, env, noop_max=30): 17 | """Sample initial states by taking random number of no-ops on reset. 18 | No-op is assumed to be action 0. 
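
        Example (illustrative only; any NoFrameskip Atari id works the same way):

            env = NoopResetEnv(gym.make('BreakoutNoFrameskip-v4'), noop_max=30)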
19 | """ 20 | gym.Wrapper.__init__(self, env) 21 | self.noop_max = noop_max 22 | self.override_num_noops = None 23 | self.noop_action = 0 24 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 25 | 26 | def reset(self, **kwargs): 27 | """ Do no-op action for a number of steps in [1, noop_max].""" 28 | self.env.reset(**kwargs) 29 | if self.override_num_noops is not None: 30 | noops = self.override_num_noops 31 | else: 32 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 33 | assert noops > 0 34 | obs = None 35 | for _ in range(noops): 36 | obs, _, done, _ = self.env.step(self.noop_action) 37 | if done: 38 | obs = self.env.reset(**kwargs) 39 | return obs 40 | 41 | def step(self, ac): 42 | return self.env.step(ac) 43 | 44 | class FireResetEnv(gym.Wrapper): 45 | def __init__(self, env): 46 | """Take action on reset for environments that are fixed until firing.""" 47 | gym.Wrapper.__init__(self, env) 48 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 49 | assert len(env.unwrapped.get_action_meanings()) >= 3 50 | 51 | def reset(self, **kwargs): 52 | self.env.reset(**kwargs) 53 | obs, _, done, _ = self.env.step(1) 54 | if done: 55 | self.env.reset(**kwargs) 56 | obs, _, done, _ = self.env.step(2) 57 | if done: 58 | self.env.reset(**kwargs) 59 | return obs 60 | 61 | def step(self, ac): 62 | return self.env.step(ac) 63 | 64 | class EpisodicLifeEnv(gym.Wrapper): 65 | def __init__(self, env): 66 | """Make end-of-life == end-of-episode, but only reset on true game over. 67 | Done by DeepMind for the DQN and co. since it helps value estimation. 68 | """ 69 | gym.Wrapper.__init__(self, env) 70 | self.lives = 0 71 | self.was_real_done = True 72 | 73 | def step(self, action): 74 | obs, reward, done, info = self.env.step(action) 75 | self.was_real_done = done 76 | # check current lives, make loss of life terminal, 77 | # then update lives to handle bonus lives 78 | lives = self.env.unwrapped.ale.lives() 79 | if lives < self.lives and lives > 0: 80 | # for Qbert sometimes we stay in lives == 0 condition for a few frames 81 | # so it's important to keep lives > 0, so that we only reset once 82 | # the environment advertises done. 83 | done = True 84 | self.lives = lives 85 | return obs, reward, done, info 86 | 87 | def reset(self, **kwargs): 88 | """Reset only when lives are exhausted. 89 | This way all states are still reachable even though lives are episodic, 90 | and the learner need not know about any of this behind-the-scenes. 
91 | """ 92 | if self.was_real_done: 93 | obs = self.env.reset(**kwargs) 94 | else: 95 | # no-op step to advance from terminal/lost life state 96 | obs, _, _, _ = self.env.step(0) 97 | self.lives = self.env.unwrapped.ale.lives() 98 | return obs 99 | 100 | class MaxAndSkipEnv(gym.Wrapper): 101 | def __init__(self, env, skip=4): 102 | """Return only every `skip`-th frame""" 103 | gym.Wrapper.__init__(self, env) 104 | # most recent raw observations (for max pooling across time steps) 105 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 106 | self._skip = skip 107 | 108 | def step(self, action): 109 | """Repeat action, sum reward, and max over last observations.""" 110 | total_reward = 0.0 111 | done = None 112 | for i in range(self._skip): 113 | obs, reward, done, info = self.env.step(action) 114 | if i == self._skip - 2: self._obs_buffer[0] = obs 115 | if i == self._skip - 1: self._obs_buffer[1] = obs 116 | total_reward += reward 117 | if done: 118 | break 119 | # Note that the observation on the done=True frame 120 | # doesn't matter 121 | max_frame = self._obs_buffer.max(axis=0) 122 | 123 | return max_frame, total_reward, done, info 124 | 125 | def reset(self, **kwargs): 126 | return self.env.reset(**kwargs) 127 | 128 | class ClipRewardEnv(gym.RewardWrapper): 129 | def __init__(self, env): 130 | gym.RewardWrapper.__init__(self, env) 131 | 132 | def reward(self, reward): 133 | """Bin reward to {+1, 0, -1} by its sign.""" 134 | return np.sign(reward) 135 | 136 | 137 | class WarpFrame(gym.ObservationWrapper): 138 | def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None): 139 | """ 140 | Warp frames to 84x84 as done in the Nature paper and later work. 141 | 142 | If the environment uses dictionary observations, `dict_space_key` can be specified which indicates which 143 | observation should be warped. 144 | """ 145 | super().__init__(env) 146 | self._width = width 147 | self._height = height 148 | self._grayscale = grayscale 149 | self._key = dict_space_key 150 | if self._grayscale: 151 | num_colors = 1 152 | else: 153 | num_colors = 3 154 | 155 | new_space = gym.spaces.Box( 156 | low=0, 157 | high=255, 158 | shape=(self._height, self._width, num_colors), 159 | dtype=np.uint8, 160 | ) 161 | if self._key is None: 162 | original_space = self.observation_space 163 | self.observation_space = new_space 164 | else: 165 | original_space = self.observation_space.spaces[self._key] 166 | self.observation_space.spaces[self._key] = new_space 167 | assert original_space.dtype == np.uint8 and len(original_space.shape) == 3 168 | 169 | def observation(self, obs): 170 | if self._key is None: 171 | frame = obs 172 | else: 173 | frame = obs[self._key] 174 | 175 | if self._grayscale: 176 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 177 | frame = cv2.resize( 178 | frame, (self._width, self._height), interpolation=cv2.INTER_AREA 179 | ) 180 | if self._grayscale: 181 | frame = np.expand_dims(frame, -1) 182 | 183 | if self._key is None: 184 | obs = frame 185 | else: 186 | obs = obs.copy() 187 | obs[self._key] = frame 188 | return obs 189 | 190 | 191 | class FrameStack(gym.Wrapper): 192 | def __init__(self, env, k): 193 | """Stack k last frames. 194 | 195 | Returns lazy array, which is much more memory efficient. 
196 | 197 | See Also 198 | -------- 199 | baselines.common.atari_wrappers.LazyFrames 200 | """ 201 | gym.Wrapper.__init__(self, env) 202 | self.k = k 203 | self.frames = deque([], maxlen=k) 204 | shp = env.observation_space.shape 205 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype) 206 | 207 | def reset(self): 208 | ob = self.env.reset() 209 | for _ in range(self.k): 210 | self.frames.append(ob) 211 | return self._get_ob() 212 | 213 | def step(self, action): 214 | ob, reward, done, info = self.env.step(action) 215 | self.frames.append(ob) 216 | return self._get_ob(), reward, done, info 217 | 218 | def _get_ob(self): 219 | assert len(self.frames) == self.k 220 | return LazyFrames(list(self.frames)) 221 | 222 | class ScaledFloatFrame(gym.ObservationWrapper): 223 | def __init__(self, env): 224 | gym.ObservationWrapper.__init__(self, env) 225 | self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) 226 | 227 | def observation(self, observation): 228 | # careful! This undoes the memory optimization, use 229 | # with smaller replay buffers only. 230 | return np.array(observation).astype(np.float32) / 255.0 231 | 232 | class LazyFrames(object): 233 | def __init__(self, frames): 234 | """This object ensures that common frames between the observations are only stored once. 235 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 236 | buffers. 237 | 238 | This object should only be converted to numpy array before being passed to the model. 239 | 240 | You'd not believe how complex the previous solution was.""" 241 | self._frames = frames 242 | self._out = None 243 | 244 | def _force(self): 245 | if self._out is None: 246 | self._out = np.concatenate(self._frames, axis=-1) 247 | self._frames = None 248 | return self._out 249 | 250 | def __array__(self, dtype=None): 251 | out = self._force() 252 | if dtype is not None: 253 | out = out.astype(dtype) 254 | return out 255 | 256 | def __len__(self): 257 | return len(self._force()) 258 | 259 | def __getitem__(self, i): 260 | return self._force()[i] 261 | 262 | def count(self): 263 | frames = self._force() 264 | return frames.shape[frames.ndim - 1] 265 | 266 | def frame(self, i): 267 | return self._force()[..., i] 268 | 269 | def make_atari(env_id, max_episode_steps=None): 270 | env = gym.make(env_id) 271 | assert 'NoFrameskip' in env.spec.id 272 | env = NoopResetEnv(env, noop_max=30) 273 | env = MaxAndSkipEnv(env, skip=4) 274 | if max_episode_steps is not None: 275 | env = TimeLimit(env, max_episode_steps=max_episode_steps) 276 | return env 277 | 278 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): 279 | """Configure environment for DeepMind-style Atari. 
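
    Example (illustrative; pairs with make_atari defined above):

        env = wrap_deepmind(make_atari('BreakoutNoFrameskip-v4'), frame_stack=True)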
280 | """ 281 | if episode_life: 282 | env = EpisodicLifeEnv(env) 283 | if 'FIRE' in env.unwrapped.get_action_meanings(): 284 | env = FireResetEnv(env) 285 | env = WarpFrame(env) 286 | if scale: 287 | env = ScaledFloatFrame(env) 288 | if clip_rewards: 289 | env = ClipRewardEnv(env) 290 | if frame_stack: 291 | env = FrameStack(env, 4) 292 | return env 293 | 294 | # time limit 295 | class TimeLimit(gym.Wrapper): 296 | def __init__(self, env, max_episode_steps=None): 297 | super(TimeLimit, self).__init__(env) 298 | self._max_episode_steps = max_episode_steps 299 | self._elapsed_steps = 0 300 | 301 | def step(self, ac): 302 | observation, reward, done, info = self.env.step(ac) 303 | self._elapsed_steps += 1 304 | if self._elapsed_steps >= self._max_episode_steps: 305 | done = True 306 | info['TimeLimit.truncated'] = True 307 | return observation, reward, done, info 308 | 309 | def reset(self, **kwargs): 310 | self._elapsed_steps = 0 311 | return self.env.reset(**kwargs) 312 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/ppo_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import optim 4 | from rl_utils.running_filter.running_filter import ZFilter 5 | from models import cnn_net, mlp_net 6 | from utils import select_actions, evaluate_actions 7 | from datetime import datetime 8 | import os 9 | import copy 10 | 11 | class ppo_agent: 12 | def __init__(self, envs, args): 13 | self.envs = envs 14 | self.args = args 15 | # start to build the network. 16 | if self.args.env_type == 'atari': 17 | self.net = cnn_net(envs.action_space.n) 18 | elif self.args.env_type == 'mujoco': 19 | self.net = mlp_net(envs.observation_space.shape[0], envs.action_space.shape[0], self.args.dist) 20 | self.old_net = copy.deepcopy(self.net) 21 | # if use the cuda... 22 | if self.args.cuda: 23 | self.net.cuda() 24 | self.old_net.cuda() 25 | # define the optimizer... 26 | self.optimizer = optim.Adam(self.net.parameters(), self.args.lr, eps=self.args.eps) 27 | # running filter... 28 | if self.args.env_type == 'mujoco': 29 | num_states = self.envs.observation_space.shape[0] 30 | self.running_state = ZFilter((num_states, ), clip=5) 31 | # check saving folder.. 32 | if not os.path.exists(self.args.save_dir): 33 | os.mkdir(self.args.save_dir) 34 | # env folder.. 35 | self.model_path = os.path.join(self.args.save_dir, self.args.env_name) 36 | if not os.path.exists(self.model_path): 37 | os.mkdir(self.model_path) 38 | # get the observation 39 | self.batch_ob_shape = (self.args.num_workers * self.args.nsteps, ) + self.envs.observation_space.shape 40 | self.obs = np.zeros((self.args.num_workers, ) + self.envs.observation_space.shape, dtype=self.envs.observation_space.dtype.name) 41 | if self.args.env_type == 'mujoco': 42 | self.obs[:] = np.expand_dims(self.running_state(self.envs.reset()), 0) 43 | else: 44 | self.obs[:] = self.envs.reset() 45 | self.dones = [False for _ in range(self.args.num_workers)] 46 | 47 | # start to train the network... 
48 | def learn(self): 49 | num_updates = self.args.total_frames // (self.args.nsteps * self.args.num_workers) 50 | # get the reward to calculate other informations 51 | episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32) 52 | final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32) 53 | for update in range(num_updates): 54 | mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], [] 55 | if self.args.lr_decay: 56 | self._adjust_learning_rate(update, num_updates) 57 | for step in range(self.args.nsteps): 58 | with torch.no_grad(): 59 | # get tensors 60 | obs_tensor = self._get_tensors(self.obs) 61 | values, pis = self.net(obs_tensor) 62 | # select actions 63 | actions = select_actions(pis, self.args.dist, self.args.env_type) 64 | if self.args.env_type == 'atari': 65 | input_actions = actions 66 | else: 67 | if self.args.dist == 'gauss': 68 | input_actions = actions.copy() 69 | elif self.args.dist == 'beta': 70 | input_actions = -1 + 2 * actions 71 | # start to store information 72 | mb_obs.append(np.copy(self.obs)) 73 | mb_actions.append(actions) 74 | mb_dones.append(self.dones) 75 | mb_values.append(values.detach().cpu().numpy().squeeze()) 76 | # start to excute the actions in the environment 77 | obs, rewards, dones, _ = self.envs.step(input_actions) 78 | # update dones 79 | if self.args.env_type == 'mujoco': 80 | dones = np.array([dones]) 81 | rewards = np.array([rewards]) 82 | self.dones = dones 83 | mb_rewards.append(rewards) 84 | # clear the observation 85 | for n, done in enumerate(dones): 86 | if done: 87 | self.obs[n] = self.obs[n] * 0 88 | if self.args.env_type == 'mujoco': 89 | # reset the environment 90 | obs = self.envs.reset() 91 | self.obs = obs if self.args.env_type == 'atari' else np.expand_dims(self.running_state(obs), 0) 92 | # process the rewards part -- display the rewards on the screen 93 | episode_rewards += rewards 94 | masks = np.array([0.0 if done_ else 1.0 for done_ in dones], dtype=np.float32) 95 | final_rewards *= masks 96 | final_rewards += (1 - masks) * episode_rewards 97 | episode_rewards *= masks 98 | # process the rollouts 99 | mb_obs = np.asarray(mb_obs, dtype=np.float32) 100 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32) 101 | mb_actions = np.asarray(mb_actions, dtype=np.float32) 102 | mb_dones = np.asarray(mb_dones, dtype=np.bool) 103 | mb_values = np.asarray(mb_values, dtype=np.float32) 104 | if self.args.env_type == 'mujoco': 105 | mb_values = np.expand_dims(mb_values, 1) 106 | # compute the last state value 107 | with torch.no_grad(): 108 | obs_tensor = self._get_tensors(self.obs) 109 | last_values, _ = self.net(obs_tensor) 110 | last_values = last_values.detach().cpu().numpy().squeeze() 111 | # start to compute advantages... 
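            # GAE recursion implemented below (noted here as a sketch of the math):
            #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
            #   A_t     = delta_t + gamma * tau * (1 - done_{t+1}) * A_{t+1}
            # and the critic targets are R_t = A_t + V(s_t).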
112 | mb_returns = np.zeros_like(mb_rewards) 113 | mb_advs = np.zeros_like(mb_rewards) 114 | lastgaelam = 0 115 | for t in reversed(range(self.args.nsteps)): 116 | if t == self.args.nsteps - 1: 117 | nextnonterminal = 1.0 - self.dones 118 | nextvalues = last_values 119 | else: 120 | nextnonterminal = 1.0 - mb_dones[t + 1] 121 | nextvalues = mb_values[t + 1] 122 | delta = mb_rewards[t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[t] 123 | mb_advs[t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam 124 | mb_returns = mb_advs + mb_values 125 | # after compute the returns, let's process the rollouts 126 | mb_obs = mb_obs.swapaxes(0, 1).reshape(self.batch_ob_shape) 127 | if self.args.env_type == 'atari': 128 | mb_actions = mb_actions.swapaxes(0, 1).flatten() 129 | mb_returns = mb_returns.swapaxes(0, 1).flatten() 130 | mb_advs = mb_advs.swapaxes(0, 1).flatten() 131 | # before update the network, the old network will try to load the weights 132 | self.old_net.load_state_dict(self.net.state_dict()) 133 | # start to update the network 134 | pl, vl, ent = self._update_network(mb_obs, mb_actions, mb_returns, mb_advs) 135 | # display the training information 136 | if update % self.args.display_interval == 0: 137 | print('[{}] Update: {} / {}, Frames: {}, Rewards: {:.3f}, Min: {:.3f}, Max: {:.3f}, PL: {:.3f},'\ 138 | 'VL: {:.3f}, Ent: {:.3f}'.format(datetime.now(), update, num_updates, (update + 1)*self.args.nsteps*self.args.num_workers, \ 139 | final_rewards.mean(), final_rewards.min(), final_rewards.max(), pl, vl, ent)) 140 | # save the model 141 | if self.args.env_type == 'atari': 142 | torch.save(self.net.state_dict(), self.model_path + '/model.pt') 143 | else: 144 | # for the mujoco, we also need to keep the running mean filter! 145 | torch.save([self.net.state_dict(), self.running_state], self.model_path + '/model.pt') 146 | 147 | # update the network 148 | def _update_network(self, obs, actions, returns, advantages): 149 | inds = np.arange(obs.shape[0]) 150 | nbatch_train = obs.shape[0] // self.args.batch_size 151 | for _ in range(self.args.epoch): 152 | np.random.shuffle(inds) 153 | for start in range(0, obs.shape[0], nbatch_train): 154 | # get the mini-batchs 155 | end = start + nbatch_train 156 | mbinds = inds[start:end] 157 | mb_obs = obs[mbinds] 158 | mb_actions = actions[mbinds] 159 | mb_returns = returns[mbinds] 160 | mb_advs = advantages[mbinds] 161 | # convert minibatches to tensor 162 | mb_obs = self._get_tensors(mb_obs) 163 | mb_actions = torch.tensor(mb_actions, dtype=torch.float32) 164 | mb_returns = torch.tensor(mb_returns, dtype=torch.float32).unsqueeze(1) 165 | mb_advs = torch.tensor(mb_advs, dtype=torch.float32).unsqueeze(1) 166 | # normalize adv 167 | mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-8) 168 | if self.args.cuda: 169 | mb_actions = mb_actions.cuda() 170 | mb_returns = mb_returns.cuda() 171 | mb_advs = mb_advs.cuda() 172 | # start to get values 173 | mb_values, pis = self.net(mb_obs) 174 | # start to calculate the value loss... 
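                # PPO clipped surrogate computed below (noted for clarity):
                #   ratio_t  = exp(log_prob - old_log_prob) = pi(a_t|s_t) / pi_old(a_t|s_t)
                #   L_policy = -E[ min(ratio_t * A_t, clip(ratio_t, 1 - clip, 1 + clip) * A_t) ]
                # with the clip range taken from self.args.clip.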
175 | value_loss = (mb_returns - mb_values).pow(2).mean() 176 | # start to calculate the policy loss 177 | with torch.no_grad(): 178 | _, old_pis = self.old_net(mb_obs) 179 | # get the old log probs 180 | old_log_prob, _ = evaluate_actions(old_pis, mb_actions, self.args.dist, self.args.env_type) 181 | old_log_prob = old_log_prob.detach() 182 | # evaluate the current policy 183 | log_prob, ent_loss = evaluate_actions(pis, mb_actions, self.args.dist, self.args.env_type) 184 | prob_ratio = torch.exp(log_prob - old_log_prob) 185 | # surr1 186 | surr1 = prob_ratio * mb_advs 187 | surr2 = torch.clamp(prob_ratio, 1 - self.args.clip, 1 + self.args.clip) * mb_advs 188 | policy_loss = -torch.min(surr1, surr2).mean() 189 | # final total loss 190 | total_loss = policy_loss + self.args.vloss_coef * value_loss - ent_loss * self.args.ent_coef 191 | # clear the grad buffer 192 | self.optimizer.zero_grad() 193 | total_loss.backward() 194 | torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.args.max_grad_norm) 195 | # update 196 | self.optimizer.step() 197 | return policy_loss.item(), value_loss.item(), ent_loss.item() 198 | 199 | # convert the numpy array to tensors 200 | def _get_tensors(self, obs): 201 | if self.args.env_type == 'atari': 202 | obs_tensor = torch.tensor(np.transpose(obs, (0, 3, 1, 2)), dtype=torch.float32) 203 | else: 204 | obs_tensor = torch.tensor(obs, dtype=torch.float32) 205 | # decide if put the tensor on the GPU 206 | if self.args.cuda: 207 | obs_tensor = obs_tensor.cuda() 208 | return obs_tensor 209 | 210 | # adjust the learning rate 211 | def _adjust_learning_rate(self, update, num_updates): 212 | lr_frac = 1 - (update / num_updates) 213 | adjust_lr = self.args.lr * lr_frac 214 | for param_group in self.optimizer.param_groups: 215 | param_group['lr'] = adjust_lr 216 | -------------------------------------------------------------------------------- /rl_utils/logger/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import os.path as osp 5 | import json 6 | import time 7 | import datetime 8 | import tempfile 9 | from collections import defaultdict 10 | from contextlib import contextmanager 11 | 12 | DEBUG = 10 13 | INFO = 20 14 | WARN = 30 15 | ERROR = 40 16 | 17 | DISABLED = 50 18 | 19 | class KVWriter(object): 20 | def writekvs(self, kvs): 21 | raise NotImplementedError 22 | 23 | class SeqWriter(object): 24 | def writeseq(self, seq): 25 | raise NotImplementedError 26 | 27 | class HumanOutputFormat(KVWriter, SeqWriter): 28 | def __init__(self, filename_or_file): 29 | if isinstance(filename_or_file, str): 30 | self.file = open(filename_or_file, 'wt') 31 | self.own_file = True 32 | else: 33 | assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s'%filename_or_file 34 | self.file = filename_or_file 35 | self.own_file = False 36 | 37 | def writekvs(self, kvs): 38 | # Create strings for printing 39 | key2str = {} 40 | for (key, val) in sorted(kvs.items()): 41 | if hasattr(val, '__float__'): 42 | valstr = '%-8.3g' % val 43 | else: 44 | valstr = str(val) 45 | key2str[self._truncate(key)] = self._truncate(valstr) 46 | 47 | # Find max widths 48 | if len(key2str) == 0: 49 | print('WARNING: tried to write empty key-value dict') 50 | return 51 | else: 52 | keywidth = max(map(len, key2str.keys())) 53 | valwidth = max(map(len, key2str.values())) 54 | 55 | # Write out the data 56 | dashes = '-' * (keywidth + valwidth + 7) 57 | lines = [dashes] 58 | for (key, val) in 
sorted(key2str.items(), key=lambda kv: kv[0].lower()): 59 | lines.append('| %s%s | %s%s |' % ( 60 | key, 61 | ' ' * (keywidth - len(key)), 62 | val, 63 | ' ' * (valwidth - len(val)), 64 | )) 65 | lines.append(dashes) 66 | self.file.write('\n'.join(lines) + '\n') 67 | 68 | # Flush the output to the file 69 | self.file.flush() 70 | 71 | def _truncate(self, s): 72 | maxlen = 30 73 | return s[:maxlen-3] + '...' if len(s) > maxlen else s 74 | 75 | def writeseq(self, seq): 76 | seq = list(seq) 77 | for (i, elem) in enumerate(seq): 78 | self.file.write(elem) 79 | if i < len(seq) - 1: # add space unless this is the last one 80 | self.file.write(' ') 81 | self.file.write('\n') 82 | self.file.flush() 83 | 84 | def close(self): 85 | if self.own_file: 86 | self.file.close() 87 | 88 | class JSONOutputFormat(KVWriter): 89 | def __init__(self, filename): 90 | self.file = open(filename, 'wt') 91 | 92 | def writekvs(self, kvs): 93 | for k, v in sorted(kvs.items()): 94 | if hasattr(v, 'dtype'): 95 | kvs[k] = float(v) 96 | self.file.write(json.dumps(kvs) + '\n') 97 | self.file.flush() 98 | 99 | def close(self): 100 | self.file.close() 101 | 102 | class CSVOutputFormat(KVWriter): 103 | def __init__(self, filename): 104 | self.file = open(filename, 'w+t') 105 | self.keys = [] 106 | self.sep = ',' 107 | 108 | def writekvs(self, kvs): 109 | # Add our current row to the history 110 | extra_keys = list(kvs.keys() - self.keys) 111 | extra_keys.sort() 112 | if extra_keys: 113 | self.keys.extend(extra_keys) 114 | self.file.seek(0) 115 | lines = self.file.readlines() 116 | self.file.seek(0) 117 | for (i, k) in enumerate(self.keys): 118 | if i > 0: 119 | self.file.write(',') 120 | self.file.write(k) 121 | self.file.write('\n') 122 | for line in lines[1:]: 123 | self.file.write(line[:-1]) 124 | self.file.write(self.sep * len(extra_keys)) 125 | self.file.write('\n') 126 | for (i, k) in enumerate(self.keys): 127 | if i > 0: 128 | self.file.write(',') 129 | v = kvs.get(k) 130 | if v is not None: 131 | self.file.write(str(v)) 132 | self.file.write('\n') 133 | self.file.flush() 134 | 135 | def close(self): 136 | self.file.close() 137 | 138 | 139 | class TensorBoardOutputFormat(KVWriter): 140 | """ 141 | Dumps key/value pairs into TensorBoard's numeric format. 142 | """ 143 | def __init__(self, dir): 144 | os.makedirs(dir, exist_ok=True) 145 | self.dir = dir 146 | self.step = 1 147 | prefix = 'events' 148 | path = osp.join(osp.abspath(dir), prefix) 149 | import tensorflow as tf 150 | from tensorflow.python import pywrap_tensorflow 151 | from tensorflow.core.util import event_pb2 152 | from tensorflow.python.util import compat 153 | self.tf = tf 154 | self.event_pb2 = event_pb2 155 | self.pywrap_tensorflow = pywrap_tensorflow 156 | self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) 157 | 158 | def writekvs(self, kvs): 159 | def summary_val(k, v): 160 | kwargs = {'tag': k, 'simple_value': float(v)} 161 | return self.tf.Summary.Value(**kwargs) 162 | summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) 163 | event = self.event_pb2.Event(wall_time=time.time(), summary=summary) 164 | event.step = self.step # is there any reason why you'd want to specify the step? 
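        # (a likely answer to the question above: the explicit step gives TensorBoard a
        #  monotonically increasing x-axis across successive dumpkvs() calls)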
165 | self.writer.WriteEvent(event) 166 | self.writer.Flush() 167 | self.step += 1 168 | 169 | def close(self): 170 | if self.writer: 171 | self.writer.Close() 172 | self.writer = None 173 | 174 | def make_output_format(format, ev_dir, log_suffix=''): 175 | os.makedirs(ev_dir, exist_ok=True) 176 | if format == 'stdout': 177 | return HumanOutputFormat(sys.stdout) 178 | elif format == 'log': 179 | return HumanOutputFormat(osp.join(ev_dir, 'log%s.txt' % log_suffix)) 180 | elif format == 'json': 181 | return JSONOutputFormat(osp.join(ev_dir, 'progress%s.json' % log_suffix)) 182 | elif format == 'csv': 183 | return CSVOutputFormat(osp.join(ev_dir, 'progress%s.csv' % log_suffix)) 184 | elif format == 'tensorboard': 185 | return TensorBoardOutputFormat(osp.join(ev_dir, 'tb%s' % log_suffix)) 186 | else: 187 | raise ValueError('Unknown format specified: %s' % (format,)) 188 | 189 | # ================================================================ 190 | # API 191 | # ================================================================ 192 | 193 | def logkv(key, val): 194 | """ 195 | Log a value of some diagnostic 196 | Call this once for each diagnostic quantity, each iteration 197 | If called many times, last value will be used. 198 | """ 199 | get_current().logkv(key, val) 200 | 201 | def logkv_mean(key, val): 202 | """ 203 | The same as logkv(), but if called many times, values averaged. 204 | """ 205 | get_current().logkv_mean(key, val) 206 | 207 | def logkvs(d): 208 | """ 209 | Log a dictionary of key-value pairs 210 | """ 211 | for (k, v) in d.items(): 212 | logkv(k, v) 213 | 214 | def dumpkvs(): 215 | """ 216 | Write all of the diagnostics from the current iteration 217 | """ 218 | return get_current().dumpkvs() 219 | 220 | def getkvs(): 221 | return get_current().name2val 222 | 223 | 224 | def log(*args, level=INFO): 225 | """ 226 | Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). 227 | """ 228 | get_current().log(*args, level=level) 229 | 230 | def debug(*args): 231 | log(*args, level=DEBUG) 232 | 233 | def info(*args): 234 | log(*args, level=INFO) 235 | 236 | def warn(*args): 237 | log(*args, level=WARN) 238 | 239 | def error(*args): 240 | log(*args, level=ERROR) 241 | 242 | 243 | def set_level(level): 244 | """ 245 | Set logging threshold on current logger. 246 | """ 247 | get_current().set_level(level) 248 | 249 | def set_comm(comm): 250 | get_current().set_comm(comm) 251 | 252 | def get_dir(): 253 | """ 254 | Get directory that log files are being written to. 
255 | will be None if there is no output directory (i.e., if you didn't call start) 256 | """ 257 | return get_current().get_dir() 258 | 259 | record_tabular = logkv 260 | dump_tabular = dumpkvs 261 | 262 | @contextmanager 263 | def profile_kv(scopename): 264 | logkey = 'wait_' + scopename 265 | tstart = time.time() 266 | try: 267 | yield 268 | finally: 269 | get_current().name2val[logkey] += time.time() - tstart 270 | 271 | def profile(n): 272 | """ 273 | Usage: 274 | @profile("my_func") 275 | def my_func(): code 276 | """ 277 | def decorator_with_name(func): 278 | def func_wrapper(*args, **kwargs): 279 | with profile_kv(n): 280 | return func(*args, **kwargs) 281 | return func_wrapper 282 | return decorator_with_name 283 | 284 | 285 | # ================================================================ 286 | # Backend 287 | # ================================================================ 288 | 289 | def get_current(): 290 | if Logger.CURRENT is None: 291 | _configure_default_logger() 292 | 293 | return Logger.CURRENT 294 | 295 | 296 | class Logger(object): 297 | DEFAULT = None # A logger with no output files. (See right below class definition) 298 | # So that you can still log to the terminal without setting up any output files 299 | CURRENT = None # Current logger being used by the free functions above 300 | 301 | def __init__(self, dir, output_formats, comm=None): 302 | self.name2val = defaultdict(float) # values this iteration 303 | self.name2cnt = defaultdict(int) 304 | self.level = INFO 305 | self.dir = dir 306 | self.output_formats = output_formats 307 | self.comm = comm 308 | 309 | # Logging API, forwarded 310 | # ---------------------------------------- 311 | def logkv(self, key, val): 312 | self.name2val[key] = val 313 | 314 | def logkv_mean(self, key, val): 315 | oldval, cnt = self.name2val[key], self.name2cnt[key] 316 | self.name2val[key] = oldval*cnt/(cnt+1) + val/(cnt+1) 317 | self.name2cnt[key] = cnt + 1 318 | 319 | def dumpkvs(self): 320 | if self.comm is None: 321 | d = self.name2val 322 | else: 323 | from baselines.common import mpi_util 324 | d = mpi_util.mpi_weighted_mean(self.comm, 325 | {name : (val, self.name2cnt.get(name, 1)) 326 | for (name, val) in self.name2val.items()}) 327 | if self.comm.rank != 0: 328 | d['dummy'] = 1 # so we don't get a warning about empty dict 329 | out = d.copy() # Return the dict for unit testing purposes 330 | for fmt in self.output_formats: 331 | if isinstance(fmt, KVWriter): 332 | fmt.writekvs(d) 333 | self.name2val.clear() 334 | self.name2cnt.clear() 335 | return out 336 | 337 | def log(self, *args, level=INFO): 338 | if self.level <= level: 339 | self._do_log(args) 340 | 341 | # Configuration 342 | # ---------------------------------------- 343 | def set_level(self, level): 344 | self.level = level 345 | 346 | def set_comm(self, comm): 347 | self.comm = comm 348 | 349 | def get_dir(self): 350 | return self.dir 351 | 352 | def close(self): 353 | for fmt in self.output_formats: 354 | fmt.close() 355 | 356 | # Misc 357 | # ---------------------------------------- 358 | def _do_log(self, args): 359 | for fmt in self.output_formats: 360 | if isinstance(fmt, SeqWriter): 361 | fmt.writeseq(map(str, args)) 362 | 363 | def get_rank_without_mpi_import(): 364 | # check environment variables here instead of importing mpi4py 365 | # to avoid calling MPI_Init() when this module is imported 366 | for varname in ['PMI_RANK', 'OMPI_COMM_WORLD_RANK']: 367 | if varname in os.environ: 368 | return int(os.environ[varname]) 369 | return 0 370 | 371 | 372 | 
def configure(dir=None, format_strs=None, comm=None, log_suffix=''): 373 | """ 374 | If comm is provided, average all numerical stats across that comm 375 | """ 376 | if dir is None: 377 | dir = os.getenv('OPENAI_LOGDIR') 378 | if dir is None: 379 | dir = osp.join(tempfile.gettempdir(), 380 | datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) 381 | assert isinstance(dir, str) 382 | dir = os.path.expanduser(dir) 383 | os.makedirs(os.path.expanduser(dir), exist_ok=True) 384 | 385 | rank = get_rank_without_mpi_import() 386 | if rank > 0: 387 | log_suffix = log_suffix + "-rank%03i" % rank 388 | 389 | if format_strs is None: 390 | if rank == 0: 391 | format_strs = os.getenv('OPENAI_LOG_FORMAT', 'stdout,log,csv').split(',') 392 | else: 393 | format_strs = os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',') 394 | format_strs = filter(None, format_strs) 395 | output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs] 396 | 397 | Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm) 398 | if output_formats: 399 | log('Logging to %s'%dir) 400 | 401 | def _configure_default_logger(): 402 | configure() 403 | Logger.DEFAULT = Logger.CURRENT 404 | 405 | def reset(): 406 | if Logger.CURRENT is not Logger.DEFAULT: 407 | Logger.CURRENT.close() 408 | Logger.CURRENT = Logger.DEFAULT 409 | log('Reset logger') 410 | 411 | @contextmanager 412 | def scoped_configure(dir=None, format_strs=None, comm=None): 413 | prevlogger = Logger.CURRENT 414 | configure(dir=dir, format_strs=format_strs, comm=comm) 415 | try: 416 | yield 417 | finally: 418 | Logger.CURRENT.close() 419 | Logger.CURRENT = prevlogger 420 | 421 | # ================================================================ 422 | 423 | def _demo(): 424 | info("hi") 425 | debug("shouldn't appear") 426 | set_level(DEBUG) 427 | debug("should appear") 428 | dir = "/tmp/testlogging" 429 | if os.path.exists(dir): 430 | shutil.rmtree(dir) 431 | configure(dir=dir) 432 | logkv("a", 3) 433 | logkv("b", 2.5) 434 | dumpkvs() 435 | logkv("b", -2.5) 436 | logkv("a", 5.5) 437 | dumpkvs() 438 | info("^^^ should see a = 5.5") 439 | logkv_mean("b", -22.5) 440 | logkv_mean("b", -44.4) 441 | logkv("a", 5.5) 442 | dumpkvs() 443 | info("^^^ should see b = -33.3") 444 | 445 | logkv("b", -2.5) 446 | dumpkvs() 447 | 448 | logkv("a", "longasslongasslongasslongasslongasslongassvalue") 449 | dumpkvs() 450 | 451 | 452 | # ================================================================ 453 | # Readers 454 | # ================================================================ 455 | 456 | def read_json(fname): 457 | import pandas 458 | ds = [] 459 | with open(fname, 'rt') as fh: 460 | for line in fh: 461 | ds.append(json.loads(line)) 462 | return pandas.DataFrame(ds) 463 | 464 | def read_csv(fname): 465 | import pandas 466 | return pandas.read_csv(fname, index_col=None, comment='#') 467 | 468 | def read_tb(path): 469 | """ 470 | path : a tensorboard file OR a directory, where we will find all TB files 471 | of the form events.* 472 | """ 473 | import pandas 474 | import numpy as np 475 | from glob import glob 476 | import tensorflow as tf 477 | if osp.isdir(path): 478 | fnames = glob(osp.join(path, "events.*")) 479 | elif osp.basename(path).startswith("events."): 480 | fnames = [path] 481 | else: 482 | raise NotImplementedError("Expected tensorboard file or directory containing them. 
Got %s"%path) 483 | tag2pairs = defaultdict(list) 484 | maxstep = 0 485 | for fname in fnames: 486 | for summary in tf.train.summary_iterator(fname): 487 | if summary.step > 0: 488 | for v in summary.summary.value: 489 | pair = (summary.step, v.simple_value) 490 | tag2pairs[v.tag].append(pair) 491 | maxstep = max(summary.step, maxstep) 492 | data = np.empty((maxstep, len(tag2pairs))) 493 | data[:] = np.nan 494 | tags = sorted(tag2pairs.keys()) 495 | for (colidx,tag) in enumerate(tags): 496 | pairs = tag2pairs[tag] 497 | for (step, value) in pairs: 498 | data[step-1, colidx] = value 499 | return pandas.DataFrame(data, columns=tags) 500 | 501 | if __name__ == "__main__": 502 | _demo() 503 | --------------------------------------------------------------------------------