├── rl_utils
    ├── __init__.py
    ├── logger
    │   ├── __init__.py
    │   ├── plot.py
    │   ├── bench.py
    │   └── logger.py
    ├── mpi_utils
    │   ├── __init__.py
    │   ├── utils.py
    │   └── normalizer.py
    ├── running_filter
    │   ├── __init__.py
    │   └── running_filter.py
    ├── seeds
    │   └── seeds.py
    ├── env_wrapper
    │   ├── frame_stack.py
    │   ├── create_env.py
    │   ├── multi_envs_wrapper.py
    │   ├── __init__.py
    │   └── atari_wrapper.py
    └── experience_replay
        └── experience_replay.py
├── figures
    ├── logo.png
    ├── 01_dqn.png
    ├── 03_a2c.png
    ├── 05_ppo.png
    ├── 06_sac.png
    ├── hopper.gif
    ├── 02_ddpg.png
    ├── 04_trpo.png
    ├── bipedal.gif
    └── breakout.gif
├── rl_algorithms
    ├── trpo
    │   ├── README.md
    │   ├── train.py
    │   ├── demo.py
    │   ├── models.py
    │   ├── arguments.py
    │   ├── utils.py
    │   └── trpo_agent.py
    ├── a2c
    │   ├── README.md
    │   ├── train.py
    │   ├── utils.py
    │   ├── demo.py
    │   ├── arguments.py
    │   ├── models.py
    │   └── a2c_agent.py
    ├── sac
    │   ├── README.md
    │   ├── train.py
    │   ├── demo.py
    │   ├── models.py
    │   ├── utils.py
    │   ├── arguments.py
    │   └── sac_agent.py
    ├── ddpg
    │   ├── README.md
    │   ├── utils.py
    │   ├── train.py
    │   ├── models.py
    │   ├── demo.py
    │   ├── arguments.py
    │   └── ddpg_agent.py
    ├── dqn_algos
    │   ├── README.md
    │   ├── train.py
    │   ├── demo.py
    │   ├── utils.py
    │   ├── arguments.py
    │   ├── models.py
    │   └── dqn_agent.py
    └── ppo
        ├── README.md
        ├── train.py
        ├── utils.py
        ├── arguments.py
        ├── demo.py
        ├── models.py
        └── ppo_agent.py
├── setup.py
├── .gitignore
└── README.md
/rl_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_utils/logger/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_utils/mpi_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_utils/running_filter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/logo.png -------------------------------------------------------------------------------- /figures/01_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/01_dqn.png -------------------------------------------------------------------------------- /figures/03_a2c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/03_a2c.png -------------------------------------------------------------------------------- /figures/05_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/05_ppo.png -------------------------------------------------------------------------------- /figures/06_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/06_sac.png
-------------------------------------------------------------------------------- /figures/hopper.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/hopper.gif -------------------------------------------------------------------------------- /figures/02_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/02_ddpg.png -------------------------------------------------------------------------------- /figures/04_trpo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/04_trpo.png -------------------------------------------------------------------------------- /figures/bipedal.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/bipedal.gif -------------------------------------------------------------------------------- /figures/breakout.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TianhongDai/reinforcement-learning-algorithms/HEAD/figures/breakout.gif -------------------------------------------------------------------------------- /rl_algorithms/trpo/README.md: -------------------------------------------------------------------------------- 1 | # Trust Region Policy Optimization (TRPO) 2 | ## Instructions 3 | 1. Train the agents (GPU is not supported): 4 | ```bash 5 | python train.py --env-name='' 6 | ``` 7 | 2. Play the demo: 8 | ```bash 9 | python demo.py --env-name='' 10 | ``` 11 | ## Results 12 | ![](../../figures/04_trpo.png) 13 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/README.md: -------------------------------------------------------------------------------- 1 | # Synchronous Advantage Actor-Critic (A2C) 2 | ## Instructions 3 | 1. Train the agents: 4 | ```bash 5 | python train.py --env-name='' --cuda (if you have a GPU) 6 | ``` 7 | 2. Play the demo: 8 | ```bash 9 | python demo.py --env-name='' 10 | ``` 11 | ## Results 12 | ![](../../figures/03_a2c.png) 13 | -------------------------------------------------------------------------------- /rl_algorithms/sac/README.md: -------------------------------------------------------------------------------- 1 | # Soft Actor-Critic (SAC) 2 | ## Instructions 3 | 1. Train the agents: 4 | ```bash 5 | python train.py --env-name='' --cuda (if you have a GPU) -- 6 | ``` 7 | 2. 
Play the demo: 8 | ```bash 9 | python demo.py --env-name='' 10 | ``` 11 | ## Results 12 | ![](../../figures/06_sac.png) 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | """ 4 | install the packages 5 | 6 | """ 7 | setup(name='rl_utils', 8 | version='0.0', 9 | description='rl utils for the rl algorithms', 10 | author='Tianhong Dai', 11 | author_email='xxx@xxx.com', 12 | url='no', 13 | packages=['rl_utils'], 14 | ) 15 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/README.md: -------------------------------------------------------------------------------- 1 | # Deep Deterministic Policy Gradient (DDPG) 2 | ## Instructions 3 | 1. Train the agents (GPU is not supported, will support it in the future): 4 | ```bash 5 | mpirun -np 1 python -u train.py --env-name='' -- 2>&1 | tee exp_ddpg.log 6 | ``` 7 | 2. Play the demo: 8 | ```bash 9 | python demo.py --env-name='' 10 | ``` 11 | ## Results 12 | ![](../../figures/02_ddpg.png) 13 | -------------------------------------------------------------------------------- /rl_utils/seeds/seeds.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import torch 4 | 5 | # set random seeds for the pytorch, numpy and random 6 | def set_seeds(args, rank=0): 7 | # set seeds for the numpy 8 | np.random.seed(args.seed + rank) 9 | # set seeds for the random.random 10 | random.seed(args.seed + rank) 11 | # set seeds for the pytorch 12 | torch.manual_seed(args.seed + rank) 13 | if args.cuda: 14 | torch.cuda.manual_seed(args.seed + rank) 15 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/README.md: -------------------------------------------------------------------------------- 1 | # Deep Q Networks (DQN) 2 | ## Instructions 3 | 1. Train the agents, can use flag `--use-dueling` and `--use-double-net` to try the Double DQN or Dueling Network Architecture: 4 | ```bash 5 | python train.py --env-name='' --cuda (if you have a GPU) -- 6 | ``` 7 | 2. 
Play the demo - Please use the same algorithm flag as training: 8 | ```bash 9 | python demo.py --env-name='' -- 10 | ``` 11 | ## Results 12 | ![](../../figures/01_dqn.png) 13 | -------------------------------------------------------------------------------- /rl_algorithms/sac/train.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from sac_agent import sac_agent 3 | from rl_utils.seeds.seeds import set_seeds 4 | from rl_utils.env_wrapper.create_env import create_single_env 5 | 6 | if __name__ == '__main__': 7 | args = get_args() 8 | # build the environment 9 | env = create_single_env(args) 10 | # set the seeds 11 | set_seeds(args) 12 | # create the agent 13 | sac_trainer = sac_agent(env, args) 14 | sac_trainer.learn() 15 | # close the environment 16 | env.close() 17 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/train.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from rl_utils.seeds.seeds import set_seeds 3 | from rl_utils.env_wrapper.create_env import create_single_env 4 | from trpo_agent import trpo_agent 5 | 6 | if __name__ == '__main__': 7 | args = get_args() 8 | # make environemnts 9 | env = create_single_env(args) 10 | # set the random seeds 11 | set_seeds(args) 12 | # create trpo trainer 13 | trpo_trainer = trpo_agent(env, args) 14 | trpo_trainer.learn() 15 | # close the environment 16 | env.close() 17 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from arguments import get_args 3 | from rl_utils.env_wrapper.create_env import create_single_env 4 | from rl_utils.logger import logger, bench 5 | from rl_utils.seeds.seeds import set_seeds 6 | from dqn_agent import dqn_agent 7 | import os 8 | import numpy as np 9 | 10 | if __name__ == '__main__': 11 | # get arguments 12 | args = get_args() 13 | # start to create the environment 14 | env = create_single_env(args) 15 | # set seeds 16 | set_seeds(args) 17 | # create trainer 18 | dqn_trainer = dqn_agent(env, args) 19 | # start to learn 20 | dqn_trainer.learn() 21 | # finally - close the environment 22 | env.close() 23 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/train.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from a2c_agent import a2c_agent 3 | from rl_utils.env_wrapper.create_env import create_multiple_envs 4 | from rl_utils.seeds.seeds import set_seeds 5 | from a2c_agent import a2c_agent 6 | import os 7 | 8 | if __name__ == '__main__': 9 | # set signle thread 10 | os.environ['OMP_NUM_THREADS'] = '1' 11 | os.environ['MKL_NUM_THREADS'] = '1' 12 | # get args 13 | args = get_args() 14 | # create environments 15 | envs = create_multiple_envs(args) 16 | # set seeds 17 | set_seeds(args) 18 | # create trainer 19 | a2c_trainer = a2c_agent(envs, args) 20 | a2c_trainer.learn() 21 | # close the environment 22 | envs.close() 23 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | # add ounoise here 5 | class ounoise(): 6 | def __init__(self, std, action_dim, 
mean=0, theta=0.15, dt=1e-2, x0=None): 7 | self.std = std 8 | self.mean = mean 9 | self.action_dim = action_dim 10 | self.theta = theta 11 | self.dt = dt 12 | self.x0 = x0 13 | 14 | # reset the noise 15 | def reset(self): 16 | self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.action_dim) 17 | 18 | # generate noise 19 | def noise(self): 20 | x = self.x_prev + self.theta * (self.mean - self.x_prev) * self.dt + \ 21 | self.std * np.sqrt(self.dt) * np.random.normal(size=self.action_dim) 22 | self.x_prev = x 23 | return x 24 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/train.py: -------------------------------------------------------------------------------- 1 | from ddpg_agent import ddpg_agent 2 | from arguments import get_args 3 | from rl_utils.seeds.seeds import set_seeds 4 | from rl_utils.env_wrapper.create_env import create_single_env 5 | from mpi4py import MPI 6 | import os 7 | 8 | if __name__ == '__main__': 9 | # set thread and mpi stuff 10 | os.environ['OMP_NUM_THREADS'] = '1' 11 | os.environ['MKL_NUM_THREADS'] = '1' 12 | os.environ['IN_MPI'] = '1' 13 | # train the network 14 | args = get_args() 15 | # build up the environment 16 | env = create_single_env(args, MPI.COMM_WORLD.Get_rank()) 17 | # set the random seeds 18 | set_seeds(args, MPI.COMM_WORLD.Get_rank()) 19 | # start traininng 20 | ddpg_trainer = ddpg_agent(env, args) 21 | ddpg_trainer.learn() 22 | # close the environment 23 | env.close() 24 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/README.md: -------------------------------------------------------------------------------- 1 | # Proximal Policy Optimization (PPO) 2 | ## Instructions 3 | 1. Train the agents - **Atari Env**: 4 | ```bash 5 | python train.py --env-name='' --cuda (if you have a GPU) --env-type='atari' --lr-decay 6 | ``` 7 | 2. Train the agents - **Mujoco Env** (we also support beta distribution, can use `--dist` flag): 8 | ```bash 9 | python train.py --env-name='' --cuda (if you have a GPU) --env-type='mujoco' --num-workers=1 --nsteps=2048 --clip=0.2 --batch-size=32 --epoch=10 --lr=3e-4 --ent-coef=0 --total-frames=1000000 --vloss-coef=1 10 | ``` 11 | 3. Play the demo - Please use the same `--env-type` and `--dist` flag used in the training. 12 | ```bash 13 | python demo.py --env-name='' --env-type='' --dist='' 14 | ``` 15 | ## Results 16 | ![](../../figures/05_ppo.png) 17 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.distributions.categorical import Categorical 4 | 5 | # select - actions 6 | def select_actions(pi, deterministic=False): 7 | cate_dist = Categorical(pi) 8 | if deterministic: 9 | return torch.argmax(pi, dim=1).item() 10 | else: 11 | return cate_dist.sample().unsqueeze(-1) 12 | 13 | # get the action log prob and entropy... 
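# pi: softmax policy output with shape [batch, n_actions]; actions: sampled action indices with shape [batch, 1] (as produced by select_actions above)
# returns the per-sample log probability ([batch, 1]) and the mean entropy of the categorical distribution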
14 | def evaluate_actions(pi, actions): 15 | cate_dist = Categorical(pi) 16 | return cate_dist.log_prob(actions.squeeze(-1)).unsqueeze(-1), cate_dist.entropy().mean() 17 | 18 | def discount_with_dones(rewards, dones, gamma): 19 | discounted = [] 20 | r = 0 21 | for reward, done in zip(rewards[::-1], dones[::-1]): 22 | r = reward + gamma * r * (1.-done) 23 | discounted.append(r) 24 | return discounted[::-1] 25 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/train.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from ppo_agent import ppo_agent 3 | from rl_utils.env_wrapper.create_env import create_multiple_envs, create_single_env 4 | from rl_utils.seeds.seeds import set_seeds 5 | import os 6 | 7 | if __name__ == '__main__': 8 | # set signle thread 9 | os.environ['OMP_NUM_THREADS'] = '1' 10 | os.environ['MKL_NUM_THREADS'] = '1' 11 | # get arguments 12 | args = get_args() 13 | # start to create the environment 14 | if args.env_type == 'atari': 15 | envs = create_multiple_envs(args) 16 | elif args.env_type == 'mujoco': 17 | envs = create_single_env(args) 18 | else: 19 | raise NotImplementedError 20 | # create trainer 21 | ppo_trainer = ppo_agent(envs, args) 22 | # start to learn 23 | ppo_trainer.learn() 24 | # close the environment 25 | envs.close() 26 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # define the actor network 6 | class actor(nn.Module): 7 | def __init__(self, obs_dims, action_dims): 8 | super(actor, self).__init__() 9 | self.fc1 = nn.Linear(obs_dims, 400) 10 | self.fc2 = nn.Linear(400, 300) 11 | self.action_out = nn.Linear(300, action_dims) 12 | 13 | def forward(self, x): 14 | x = F.relu(self.fc1(x)) 15 | x = F.relu(self.fc2(x)) 16 | actions = torch.tanh(self.action_out(x)) 17 | return actions 18 | 19 | class critic(nn.Module): 20 | def __init__(self, obs_dims, action_dims): 21 | super(critic, self).__init__() 22 | self.fc1 = nn.Linear(obs_dims, 400) 23 | self.fc2 = nn.Linear(400 + action_dims, 300) 24 | self.q_out = nn.Linear(300, 1) 25 | 26 | def forward(self, x, actions): 27 | x = F.relu(self.fc1(x)) 28 | x = torch.cat([x, actions], dim=1) 29 | x = F.relu(self.fc2(x)) 30 | q_value = self.q_out(x) 31 | return q_value 32 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/demo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from arguments import get_args 3 | from models import net 4 | import torch 5 | from rl_utils.env_wrapper.atari_wrapper import make_atari, wrap_deepmind 6 | 7 | def get_tensors(obs): 8 | obs = np.transpose(obs, (2, 0, 1)) 9 | obs = np.expand_dims(obs, 0) 10 | obs = torch.tensor(obs, dtype=torch.float32) 11 | return obs 12 | 13 | if __name__ == '__main__': 14 | args = get_args() 15 | # create the environment 16 | env = make_atari(args.env_name) 17 | env = wrap_deepmind(env, frame_stack=True) 18 | # create the network 19 | net = net(env.action_space.n, args.use_dueling) 20 | # model path 21 | model_path = args.save_dir + args.env_name + '/model.pt' 22 | # load the models 23 | net.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage)) 24 | # start to test the 
demo 25 | obs = env.reset() 26 | for _ in range(2000): 27 | env.render() 28 | with torch.no_grad(): 29 | obs_tensor = get_tensors(obs) 30 | action_value = net(obs_tensor) 31 | action = torch.argmax(action_value.squeeze()).item() 32 | obs, reward, done, _ = env.step(action) 33 | if done: 34 | obs = env.reset() 35 | env.close() 36 | -------------------------------------------------------------------------------- /rl_utils/env_wrapper/frame_stack.py: -------------------------------------------------------------------------------- 1 | from rl_utils.env_wrapper import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | 6 | class VecFrameStack(VecEnvWrapper): 7 | def __init__(self, venv, nstack): 8 | self.venv = venv 9 | self.nstack = nstack 10 | wos = venv.observation_space # wrapped ob space 11 | low = np.repeat(wos.low, self.nstack, axis=-1) 12 | high = np.repeat(wos.high, self.nstack, axis=-1) 13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype) 14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 16 | 17 | def step_wait(self): 18 | obs, rews, news, infos = self.venv.step_wait() 19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 20 | for (i, new) in enumerate(news): 21 | if new: 22 | self.stackedobs[i] = 0 23 | self.stackedobs[..., -obs.shape[-1]:] = obs 24 | return self.stackedobs, rews, news, infos 25 | 26 | def reset(self): 27 | obs = self.venv.reset() 28 | self.stackedobs[...] = 0 29 | self.stackedobs[..., -obs.shape[-1]:] = obs 30 | return self.stackedobs 31 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/demo.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from models import net 3 | import torch 4 | from utils import select_actions 5 | import cv2 6 | import numpy as np 7 | from rl_utils.env_wrapper.frame_stack import VecFrameStack 8 | from rl_utils.env_wrapper.atari_wrapper import make_atari, wrap_deepmind 9 | 10 | # update the current observation 11 | def get_tensors(obs): 12 | input_tensor = torch.tensor(np.transpose(obs, (2, 0, 1)), dtype=torch.float32).unsqueeze(0) 13 | return input_tensor 14 | 15 | if __name__ == "__main__": 16 | args = get_args() 17 | # create environment 18 | #env = VecFrameStack(wrap_deepmind(make_atari(args.env_name)), 4) 19 | env = make_atari(args.env_name) 20 | env = wrap_deepmind(env, frame_stack=True) 21 | # get the model path 22 | model_path = args.save_dir + args.env_name + '/model.pt' 23 | network = net(env.action_space.n) 24 | network.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage)) 25 | obs = env.reset() 26 | while True: 27 | env.render() 28 | # get the obs 29 | with torch.no_grad(): 30 | input_tensor = get_tensors(obs) 31 | _, pi = network(input_tensor) 32 | actions = select_actions(pi, True) 33 | obs, reward, done, _ = env.step([actions]) 34 | env.close() 35 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.distributions.normal import Normal 4 | from torch.distributions.beta import Beta 5 | from torch.distributions.categorical import Categorical 6 | import random 7 | 8 | def select_actions(pi, dist_type, env_type): 9 | if 
env_type == 'atari': 10 | actions = Categorical(pi).sample() 11 | else: 12 | if dist_type == 'gauss': 13 | mean, std = pi 14 | actions = Normal(mean, std).sample() 15 | elif dist_type == 'beta': 16 | alpha, beta = pi 17 | actions = Beta(alpha.detach().cpu(), beta.detach().cpu()).sample() 18 | # return actions 19 | return actions.detach().cpu().numpy().squeeze() 20 | 21 | def evaluate_actions(pi, actions, dist_type, env_type): 22 | if env_type == 'atari': 23 | cate_dist = Categorical(pi) 24 | log_prob = cate_dist.log_prob(actions).unsqueeze(-1) 25 | entropy = cate_dist.entropy().mean() 26 | else: 27 | if dist_type == 'gauss': 28 | mean, std = pi 29 | normal_dist = Normal(mean, std) 30 | log_prob = normal_dist.log_prob(actions).sum(dim=1, keepdim=True) 31 | entropy = normal_dist.entropy().mean() 32 | elif dist_type == 'beta': 33 | alpha, beta = pi 34 | beta_dist = Beta(alpha, beta) 35 | log_prob = beta_dist.log_prob(actions).sum(dim=1, keepdim=True) 36 | entropy = beta_dist.entropy().mean() 37 | return log_prob, entropy 38 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/demo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gym 4 | from arguments import get_args 5 | from models import network 6 | 7 | def denormalize(x, mean, std, clip=10): 8 | x -= mean 9 | x /= (std + 1e-8) 10 | return np.clip(x, -clip, clip) 11 | 12 | def get_tensors(x): 13 | return torch.tensor(x, dtype=torch.float32).unsqueeze(0) 14 | 15 | if __name__ == '__main__': 16 | args = get_args() 17 | # create the environment 18 | env = gym.make(args.env_name) 19 | # build up the network 20 | net = network(env.observation_space.shape[0], env.action_space.shape[0]) 21 | # load the saved model 22 | model_path = args.save_dir + args.env_name + '/model.pt' 23 | network_model, filters = torch.load(model_path, map_location=lambda storage, loc: storage) 24 | net.load_state_dict(network_model) 25 | net.eval() 26 | for _ in range(10): 27 | obs = denormalize(env.reset(), filters.rs.mean, filters.rs.std) 28 | reward_total = 0 29 | for _ in range(10000): 30 | env.render() 31 | obs_tensor = get_tensors(obs) 32 | with torch.no_grad(): 33 | _, (mean, _) = net(obs_tensor) 34 | action = mean.numpy().squeeze() 35 | obs, reward, done, _ = env.step(action) 36 | reward_total += reward 37 | obs = denormalize(obs, filters.rs.mean, filters.rs.std) 38 | if done: 39 | break 40 | print('the reward of this episode is: {}'.format(reward_total)) 41 | env.close() 42 | -------------------------------------------------------------------------------- /rl_utils/experience_replay/experience_replay.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | """ 5 | define the replay buffer and corresponding algorithms like PER 6 | 7 | """ 8 | 9 | class replay_buffer: 10 | def __init__(self, memory_size): 11 | self.storge = [] 12 | self.memory_size = memory_size 13 | self.next_idx = 0 14 | 15 | # add the samples 16 | def add(self, obs, action, reward, obs_, done): 17 | data = (obs, action, reward, obs_, done) 18 | if self.next_idx >= len(self.storge): 19 | self.storge.append(data) 20 | else: 21 | self.storge[self.next_idx] = data 22 | # get the next idx 23 | self.next_idx = (self.next_idx + 1) % self.memory_size 24 | 25 | # encode samples 26 | def _encode_sample(self, idx): 27 | obses, actions, rewards, obses_, dones = [], [], [], [], [] 28 | for i in 
idx: 29 | data = self.storge[i] 30 | obs, action, reward, obs_, done = data 31 | obses.append(np.array(obs, copy=False)) 32 | actions.append(np.array(action, copy=False)) 33 | rewards.append(reward) 34 | obses_.append(np.array(obs_, copy=False)) 35 | dones.append(done) 36 | return np.array(obses), np.array(actions), np.array(rewards), np.array(obses_), np.array(dones) 37 | 38 | # sample from the memory 39 | def sample(self, batch_size): 40 | idxes = [random.randint(0, len(self.storge) - 1) for _ in range(batch_size)] 41 | return self._encode_sample(idxes) 42 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | class network(nn.Module): 6 | def __init__(self, num_states, num_actions): 7 | super(network, self).__init__() 8 | # define the critic 9 | self.critic = critic(num_states) 10 | self.actor = actor(num_states, num_actions) 11 | 12 | def forward(self, x): 13 | state_value = self.critic(x) 14 | pi = self.actor(x) 15 | return state_value, pi 16 | 17 | class critic(nn.Module): 18 | def __init__(self, num_states): 19 | super(critic, self).__init__() 20 | self.fc1 = nn.Linear(num_states, 64) 21 | self.fc2 = nn.Linear(64, 64) 22 | self.value = nn.Linear(64, 1) 23 | 24 | def forward(self, x): 25 | x = F.tanh(self.fc1(x)) 26 | x = F.tanh(self.fc2(x)) 27 | value = self.value(x) 28 | return value 29 | 30 | class actor(nn.Module): 31 | def __init__(self, num_states, num_actions): 32 | super(actor, self).__init__() 33 | self.fc1 = nn.Linear(num_states, 64) 34 | self.fc2 = nn.Linear(64, 64) 35 | self.action_mean = nn.Linear(64, num_actions) 36 | self.sigma_log = nn.Parameter(torch.zeros(1, num_actions)) 37 | 38 | def forward(self, x): 39 | x = F.tanh(self.fc1(x)) 40 | x = F.tanh(self.fc2(x)) 41 | mean = self.action_mean(x) 42 | sigma_log = self.sigma_log.expand_as(mean) 43 | sigma = torch.exp(sigma_log) 44 | pi = (mean, sigma) 45 | 46 | return pi 47 | -------------------------------------------------------------------------------- /rl_algorithms/sac/demo.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | import gym 3 | import torch 4 | import numpy as np 5 | from models import tanh_gaussian_actor 6 | 7 | if __name__ == '__main__': 8 | args = get_args() 9 | env = gym.make(args.env_name) 10 | # get environment infos 11 | obs_dims = env.observation_space.shape[0] 12 | action_dims = env.action_space.shape[0] 13 | action_max = env.action_space.high[0] 14 | # define the network 15 | actor_net = tanh_gaussian_actor(obs_dims, action_dims, args.hidden_size, args.log_std_min, args.log_std_max) 16 | # load models 17 | model_path = args.save_dir + args.env_name + '/model.pt' 18 | # load the network weights 19 | actor_net.load_state_dict(torch.load(model_path, map_location='cpu')) 20 | for ep in range(5): 21 | obs = env.reset() 22 | reward_sum = 0 23 | # set the maximum timesteps here... 
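# each demo episode runs for at most 1000 environment steps and stops early once the environment returns done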
24 | for _ in range(1000): 25 | env.render() 26 | with torch.no_grad(): 27 | obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0) 28 | mean, std = actor_net(obs_tensor) 29 | actions = torch.tanh(mean).detach().numpy().squeeze() 30 | if action_dims == 1: 31 | actions = np.array([actions]) 32 | obs_, reward, done, _ = env.step(action_max * actions) 33 | reward_sum += reward 34 | if done: 35 | break 36 | obs = obs_ 37 | print('the episode is: {}, the reward is: {}'.format(ep, reward_sum)) 38 | env.close() 39 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(): 4 | parse = argparse.ArgumentParser() 5 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor of the RL') 6 | parse.add_argument('--env-name', type=str, default='Walker2d-v2', help='the training environment') 7 | parse.add_argument('--seed', type=int, default=123, help='the random seed') 8 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the folder to save models') 9 | parse.add_argument('--total-timesteps', type=int, default=int(1e6), help='the total frames') 10 | parse.add_argument('--nsteps', type=int, default=1024, help='the steps to collect samples') 11 | parse.add_argument('--lr', type=float, default=3e-4) 12 | parse.add_argument('--batch-size', type=int, default=64, help='the mini batch size ot update the value function') 13 | parse.add_argument('--vf-itrs', type=int, default=5, help='the times to update the value network') 14 | parse.add_argument('--tau', type=float, default=0.95, help='the param to calculate the gae') 15 | parse.add_argument('--damping', type=float, default=0.1, help='the damping coeffificent') 16 | parse.add_argument('--max-kl', type=float, default=0.01, help='the max kl divergence') 17 | parse.add_argument('--cuda', action='store_true', help='if use gpu') 18 | parse.add_argument('--env-type', type=str, default='mujoco', help='the environment type') 19 | parse.add_argument('--log-dir', type=str, default='logs', help='folder to save log files') 20 | 21 | args = parse.parse_args() 22 | 23 | return args 24 | -------------------------------------------------------------------------------- /rl_utils/mpi_utils/utils.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | import torch 4 | 5 | # sync_networks across the different cores 6 | def sync_networks(network): 7 | """ 8 | netowrk is the network you want to sync 9 | 10 | """ 11 | comm = MPI.COMM_WORLD 12 | flat_params = _get_flat_params_or_grads(network, mode='params') 13 | comm.Bcast(flat_params, root=0) 14 | # set the flat params back to the network 15 | _set_flat_params_or_grads(network, flat_params, mode='params') 16 | 17 | def sync_grads(network): 18 | flat_grads = _get_flat_params_or_grads(network, mode='grads') 19 | comm = MPI.COMM_WORLD 20 | global_grads = np.zeros_like(flat_grads) 21 | comm.Allreduce(flat_grads, global_grads, op=MPI.SUM) 22 | _set_flat_params_or_grads(network, global_grads, mode='grads') 23 | 24 | # get the flat grads or params 25 | def _get_flat_params_or_grads(network, mode='params'): 26 | """ 27 | include two kinds: grads and params 28 | 29 | """ 30 | attr = 'data' if mode == 'params' else 'grad' 31 | return np.concatenate([getattr(param, attr).cpu().numpy().flatten() for param in 
network.parameters()]) 32 | 33 | def _set_flat_params_or_grads(network, flat_params, mode='params'): 34 | """ 35 | include two kinds: grads and params 36 | 37 | """ 38 | attr = 'data' if mode == 'params' else 'grad' 39 | # the pointer 40 | pointer = 0 41 | for param in network.parameters(): 42 | getattr(param, attr).copy_(torch.tensor(flat_params[pointer:pointer + param.data.numel()]).view_as(param.data)) 43 | pointer += param.data.numel() 44 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/demo.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | import gym 3 | from models import actor 4 | import torch 5 | import numpy as np 6 | 7 | def normalize(obs, mean, std, clip): 8 | return np.clip((obs - mean) / std, -clip, clip) 9 | 10 | if __name__ == '__main__': 11 | args = get_args() 12 | env = gym.make(args.env_name) 13 | # get environment infos 14 | obs_dims = env.observation_space.shape[0] 15 | action_dims = env.action_space.shape[0] 16 | action_max = env.action_space.high[0] 17 | # define the network 18 | actor_net = actor(obs_dims, action_dims) 19 | # load models 20 | model_path = args.save_dir + args.env_name + '/model.pt' 21 | model, mean, std = torch.load(model_path, map_location=lambda storage, loc: storage) 22 | # load models into the network 23 | actor_net.load_state_dict(model) 24 | for ep in range(10): 25 | obs = env.reset() 26 | reward_sum = 0 27 | while True: 28 | env.render() 29 | with torch.no_grad(): 30 | norm_obs = normalize(obs, mean, std, args.clip_range) 31 | norm_obs_tensor = torch.tensor(norm_obs, dtype=torch.float32).unsqueeze(0) 32 | actions = actor_net(norm_obs_tensor) 33 | actions = actions.detach().numpy().squeeze() 34 | if action_dims == 1: 35 | actions = np.array([actions]) 36 | obs_, reward, done, _ = env.step(action_max * actions) 37 | reward_sum += reward 38 | if done: 39 | break 40 | obs = obs_ 41 | print('the episode is: {}, the reward is: {}'.format(ep, reward_sum)) 42 | env.close() 43 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | # linear exploration schedule 5 | class linear_schedule: 6 | def __init__(self, total_timesteps, final_ratio, init_ratio=1.0): 7 | self.total_timesteps = total_timesteps 8 | self.final_ratio = final_ratio 9 | self.init_ratio = init_ratio 10 | 11 | def get_value(self, timestep): 12 | frac = min(float(timestep) / self.total_timesteps, 1.0) 13 | return self.init_ratio - frac * (self.init_ratio - self.final_ratio) 14 | 15 | # select actions 16 | def select_actions(action_value, explore_eps): 17 | action_value = action_value.cpu().numpy().squeeze() 18 | # select actions 19 | action = np.argmax(action_value) if random.random() > explore_eps else np.random.randint(action_value.shape[0]) 20 | return action 21 | 22 | # record the reward info of the dqn experiments 23 | class reward_recorder: 24 | def __init__(self, history_length=100): 25 | self.history_length = history_length 26 | # the empty buffer to store rewards 27 | self.buffer = [0.0] 28 | self._episode_length = 1 29 | 30 | # add rewards 31 | def add_rewards(self, reward): 32 | self.buffer[-1] += reward 33 | 34 | # start new episode 35 | def start_new_episode(self): 36 | if self.get_length >= self.history_length: 37 | self.buffer.pop(0) 38 | # append new one 39 | 
self.buffer.append(0.0) 40 | self._episode_length += 1 41 | 42 | # get length of buffer 43 | @property 44 | def get_length(self): 45 | return len(self.buffer) 46 | 47 | @property 48 | def mean(self): 49 | return np.mean(self.buffer) 50 | 51 | # get the length of total episodes 52 | @property 53 | def num_episodes(self): 54 | return self._episode_length 55 | -------------------------------------------------------------------------------- /rl_algorithms/sac/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # the flatten mlp 6 | class flatten_mlp(nn.Module): 7 | #TODO: add the initialization method for it 8 | def __init__(self, input_dims, hidden_size, action_dims=None): 9 | super(flatten_mlp, self).__init__() 10 | self.fc1 = nn.Linear(input_dims, hidden_size) if action_dims is None else nn.Linear(input_dims + action_dims, hidden_size) 11 | self.fc2 = nn.Linear(hidden_size, hidden_size) 12 | self.q_value = nn.Linear(hidden_size, 1) 13 | 14 | def forward(self, obs, action=None): 15 | inputs = torch.cat([obs, action], dim=1) if action is not None else obs 16 | x = F.relu(self.fc1(inputs)) 17 | x = F.relu(self.fc2(x)) 18 | output = self.q_value(x) 19 | return output 20 | 21 | # define the policy network - tanh gaussian policy network 22 | # TODO: Not use the log std 23 | class tanh_gaussian_actor(nn.Module): 24 | def __init__(self, input_dims, action_dims, hidden_size, log_std_min, log_std_max): 25 | super(tanh_gaussian_actor, self).__init__() 26 | self.fc1 = nn.Linear(input_dims, hidden_size) 27 | self.fc2 = nn.Linear(hidden_size, hidden_size) 28 | self.mean = nn.Linear(hidden_size, action_dims) 29 | self.log_std = nn.Linear(hidden_size, action_dims) 30 | # the log_std_min and log_std_max 31 | self.log_std_min = log_std_min 32 | self.log_std_max = log_std_max 33 | 34 | def forward(self, obs): 35 | x = F.relu(self.fc1(obs)) 36 | x = F.relu(self.fc2(x)) 37 | mean = self.mean(x) 38 | log_std = self.log_std(x) 39 | # clamp the log std 40 | log_std = torch.clamp(log_std, min=self.log_std_min, max=self.log_std_max) 41 | # the reparameterization trick 42 | # return mean and std 43 | return (mean, torch.exp(log_std)) 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # DS Store 107 | .DS_Store 108 | 109 | #saved_model 110 | *.pth 111 | 112 | *.pt 113 | 114 | *.log 115 | 116 | *.txt 117 | *.csv 118 | logs/ 119 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(): 4 | parse = argparse.ArgumentParser() 5 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor of RL') 6 | parse.add_argument('--seed', type=int, default=123, help='the random seeds') 7 | parse.add_argument('--env-name', type=str, default='BreakoutNoFrameskip-v4', help='the environment name') 8 | parse.add_argument('--lr', type=float, default=7e-4, help='learning rate of the algorithm') 9 | parse.add_argument('--value-loss-coef', type=float, default=0.5, help='the coefficient of value loss') 10 | parse.add_argument('--tau', type=float, default=0.95, help='gae coefficient') 11 | parse.add_argument('--cuda', action='store_true', help='use cuda do the training') 12 | parse.add_argument('--total-frames', type=int, default=20000000, help='the total frames for training') 13 | parse.add_argument('--eps', type=float, default=1e-5, help='param for adam optimizer') 14 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the folder to save models') 15 | parse.add_argument('--nsteps', type=int, default=5, help='the steps to update the network') 16 | parse.add_argument('--num-workers', type=int, default=16, help='the number of cpu you use') 17 | parse.add_argument('--entropy-coef', type=float, default=0.01, help='entropy-reg') 18 | parse.add_argument('--log-interval', type=int, default=100, help='the log interval') 19 | parse.add_argument('--alpha', type=float, default=0.99, help='the alpha coe of RMSprop') 20 | parse.add_argument('--max-grad-norm', type=float, default=0.5, help='the grad clip') 21 | parse.add_argument('--use-gae', action='store_true', help='use-gae') 22 | parse.add_argument('--log-dir', type=str, default='logs', help='log dir') 23 | parse.add_argument('--env-type', type=str, default='atari', help='the type of the environment') 24 | 25 | args = parse.parse_args() 26 | 27 | return args 28 | -------------------------------------------------------------------------------- /rl_utils/running_filter/running_filter.py: 
-------------------------------------------------------------------------------- 1 | from collections import deque 2 | import numpy as np 3 | 4 | # this is from the https://github.com/ikostrikov/pytorch-trpo/blob/master/running_state.py 5 | 6 | # from https://github.com/joschu/modular_rl 7 | # http://www.johndcook.com/blog/standard_deviation/ 8 | class RunningStat(object): 9 | def __init__(self, shape): 10 | self._n = 0 11 | self._M = np.zeros(shape) 12 | self._S = np.zeros(shape) 13 | 14 | def push(self, x): 15 | x = np.asarray(x) 16 | assert x.shape == self._M.shape 17 | self._n += 1 18 | if self._n == 1: 19 | self._M[...] = x 20 | else: 21 | oldM = self._M.copy() 22 | self._M[...] = oldM + (x - oldM) / self._n 23 | self._S[...] = self._S + (x - oldM) * (x - self._M) 24 | 25 | @property 26 | def n(self): 27 | return self._n 28 | 29 | @property 30 | def mean(self): 31 | return self._M 32 | 33 | @property 34 | def var(self): 35 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M) 36 | 37 | @property 38 | def std(self): 39 | return np.sqrt(self.var) 40 | 41 | @property 42 | def shape(self): 43 | return self._M.shape 44 | 45 | 46 | class ZFilter: 47 | """ 48 | y = (x-mean)/std 49 | using running estimates of mean,std 50 | """ 51 | 52 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 53 | self.demean = demean 54 | self.destd = destd 55 | self.clip = clip 56 | 57 | self.rs = RunningStat(shape) 58 | 59 | def __call__(self, x, update=True): 60 | if update: self.rs.push(x) 61 | if self.demean: 62 | x = x - self.rs.mean 63 | if self.destd: 64 | x = x / (self.rs.std + 1e-8) 65 | if self.clip: 66 | x = np.clip(x, -self.clip, self.clip) 67 | return x 68 | 69 | def output_shape(self, input_space): 70 | return input_space.shape 71 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # the convolution layer of deepmind 6 | class deepmind(nn.Module): 7 | def __init__(self): 8 | super(deepmind, self).__init__() 9 | self.conv1 = nn.Conv2d(4, 32, 8, stride=4) 10 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 11 | self.conv3 = nn.Conv2d(64, 32, 3, stride=1) 12 | self.fc1 = nn.Linear(32 * 7 * 7, 512) 13 | # start to do the init... 14 | nn.init.orthogonal_(self.conv1.weight.data, gain=nn.init.calculate_gain('relu')) 15 | nn.init.orthogonal_(self.conv2.weight.data, gain=nn.init.calculate_gain('relu')) 16 | nn.init.orthogonal_(self.conv3.weight.data, gain=nn.init.calculate_gain('relu')) 17 | nn.init.orthogonal_(self.fc1.weight.data, gain=nn.init.calculate_gain('relu')) 18 | # init the bias... 19 | nn.init.constant_(self.conv1.bias.data, 0) 20 | nn.init.constant_(self.conv2.bias.data, 0) 21 | nn.init.constant_(self.conv3.bias.data, 0) 22 | nn.init.constant_(self.fc1.bias.data, 0) 23 | 24 | def forward(self, x): 25 | x = F.relu(self.conv1(x)) 26 | x = F.relu(self.conv2(x)) 27 | x = F.relu(self.conv3(x)) 28 | x = x.view(-1, 32 * 7 * 7) 29 | x = F.relu(self.fc1(x)) 30 | return x 31 | 32 | # in the initial, just the nature CNN 33 | class net(nn.Module): 34 | def __init__(self, num_actions): 35 | super(net, self).__init__() 36 | self.cnn_layer = deepmind() 37 | self.critic = nn.Linear(512, 1) 38 | self.actor = nn.Linear(512, num_actions) 39 | # init the linear layer.. 
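# the value head uses the default orthogonal gain; the policy head below uses a small gain (0.01) so the initial action distribution stays close to uniform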
40 | nn.init.orthogonal_(self.critic.weight.data) 41 | nn.init.constant_(self.critic.bias.data, 0) 42 | # init the policy layer... 43 | nn.init.orthogonal_(self.actor.weight.data, gain=0.01) 44 | nn.init.constant_(self.actor.bias.data, 0) 45 | 46 | def forward(self, inputs): 47 | x = self.cnn_layer(inputs / 255.0) 48 | value = self.critic(x) 49 | pi = F.softmax(self.actor(x), dim=1) 50 | return value, pi 51 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(): 4 | parse = argparse.ArgumentParser(description='ddpg') 5 | parse.add_argument('--env-name', type=str, default='Pendulum-v0', help='the training environment') 6 | parse.add_argument('--lr-actor', type=float, default=1e-4, help='the lr of the actor') 7 | parse.add_argument('--lr-critic', type=float, default=1e-3, help='the lr of the critic') 8 | parse.add_argument('--critic-l2-reg', type=float, default=1e-2, help='the critic reg') 9 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor') 10 | parse.add_argument('--nb-epochs', type=int, default=500, help='the epochs to train the network') 11 | parse.add_argument('--nb-cycles', type=int, default=20) 12 | parse.add_argument('--nb-train', type=int, default=50, help='number to train the agent') 13 | parse.add_argument('--nb-rollout-steps', type=int, default=100, help='steps to collect samples') 14 | parse.add_argument('--nb-test-rollouts', type=int, default=10, help='the number of test') 15 | parse.add_argument('--batch-size', type=int, default=128, help='the batch size to update network') 16 | parse.add_argument('--replay-size', type=int, default=int(1e6), help='the size of the replay buffer') 17 | parse.add_argument('--clip-range', type=float, default=5, help='clip range of the observation') 18 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the place save the models') 19 | parse.add_argument('--polyak', type=float, default=0.95, help='the expoential weighted coefficient.') 20 | parse.add_argument('--total-frames', type=int, default=int(1e6), help='total frames') 21 | parse.add_argument('--log-dir', type=str, default='logs', help='place to save log files') 22 | parse.add_argument('--env-type', type=str, default=None, help='environment type') 23 | parse.add_argument('--seed', type=int, default=123, help='random seed') 24 | parse.add_argument('--display-interval', type=int, default=10, help='interval to display') 25 | # ddpg not support gpu 26 | parse.add_argument('--cuda', action='store_true', help='if use GPU') 27 | 28 | args = parse.parse_args() 29 | return args 30 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.distributions.normal import Normal 4 | 5 | # select actions 6 | def select_actions(pi): 7 | mean, std = pi 8 | normal_dist = Normal(mean, std) 9 | return normal_dist.sample().detach().numpy().squeeze() 10 | 11 | # evaluate the actions 12 | def eval_actions(pi, actions): 13 | mean, std = pi 14 | normal_dist = Normal(mean, std) 15 | return normal_dist.log_prob(actions).sum(dim=1, keepdim=True) 16 | 17 | # conjugated gradient 18 | def conjugated_gradient(fvp, b, update_steps, obs, pi_old, residual_tol=1e-10): 19 | # the initial solution is 
zero 20 | x = torch.zeros(b.size(), dtype=torch.float32) 21 | r = b.clone() 22 | p = b.clone() 23 | rdotr = torch.dot(r, r) 24 | for i in range(update_steps): 25 | fv_product = fvp(p, obs, pi_old) 26 | alpha = rdotr / torch.dot(p, fv_product) 27 | x = x + alpha * p 28 | r = r - alpha * fv_product 29 | new_rdotr = torch.dot(r, r) 30 | beta = new_rdotr / rdotr 31 | p = r + beta * p 32 | rdotr = new_rdotr 33 | # if less than residual tot.. break 34 | if rdotr < residual_tol: 35 | break 36 | return x 37 | 38 | # line search 39 | def line_search(model, loss_fn, x, full_step, expected_rate, obs, adv, actions, pi_old, max_backtracks=10, accept_ratio=0.1): 40 | fval = loss_fn(obs, adv, actions, pi_old).data 41 | for (_n_backtracks, stepfrac) in enumerate(0.5**np.arange(max_backtracks)): 42 | xnew = x + stepfrac * full_step 43 | set_flat_params_to(model, xnew) 44 | new_fval = loss_fn(obs, adv, actions, pi_old).data 45 | actual_improve = fval - new_fval 46 | expected_improve = expected_rate * stepfrac 47 | ratio = actual_improve / expected_improve 48 | if ratio.item() > accept_ratio and actual_improve.item() > 0: 49 | return True, xnew 50 | return False, x 51 | 52 | 53 | def set_flat_params_to(model, flat_params): 54 | prev_indx = 0 55 | for param in model.parameters(): 56 | flat_size = int(np.prod(list(param.size()))) 57 | param.data.copy_(flat_params[prev_indx:prev_indx + flat_size].view(param.size())) 58 | prev_indx += flat_size 59 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(): 4 | parse = argparse.ArgumentParser() 5 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor of RL') 6 | parse.add_argument('--seed', type=int, default=123, help='the random seeds') 7 | parse.add_argument('--env-name', type=str, default='PongNoFrameskip-v4', help='the environment name') 8 | parse.add_argument('--batch-size', type=int, default=32, help='the batch size of updating') 9 | parse.add_argument('--lr', type=float, default=1e-4, help='learning rate of the algorithm') 10 | parse.add_argument('--buffer-size', type=int, default=10000, help='the size of the buffer') 11 | parse.add_argument('--cuda', action='store_true', help='if use the gpu') 12 | parse.add_argument('--init-ratio', type=float, default=1, help='the initial exploration ratio') 13 | parse.add_argument('--exploration_fraction', type=float, default=0.1, help='decide how many steps to do the exploration') 14 | parse.add_argument('--final-ratio', type=float, default=0.01, help='the final exploration ratio') 15 | parse.add_argument('--grad-norm-clipping', type=float, default=10, help='the gradient clipping') 16 | parse.add_argument('--total-timesteps', type=int, default=int(1e7), help='the total timesteps to train network') 17 | parse.add_argument('--learning-starts', type=int, default=10000, help='the frames start to learn') 18 | parse.add_argument('--train-freq', type=int, default=4, help='the frequency to update the network') 19 | parse.add_argument('--target-network-update-freq', type=int, default=1000, help='the frequency to update the target network') 20 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the folder to save models') 21 | parse.add_argument('--display-interval', type=int, default=10, help='the display interval') 22 | parse.add_argument('--env-type', type=str, default='atari', help='the 
environment type') 23 | parse.add_argument('--log-dir', type=str, default='logs', help='dir to save log information') 24 | parse.add_argument('--use-double-net', action='store_true', help='use double dqn to train the agent') 25 | parse.add_argument('--use-dueling', action='store_true', help='use dueling to train the agent') 26 | 27 | args = parse.parse_args() 28 | 29 | return args 30 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def get_args(): 4 | parse = argparse.ArgumentParser() 5 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor of RL') 6 | parse.add_argument('--seed', type=int, default=123, help='the random seeds') 7 | parse.add_argument('--num-workers', type=int, default=8, help='the number of workers to collect samples') 8 | parse.add_argument('--env-name', type=str, default='PongNoFrameskip-v4', help='the environment name') 9 | parse.add_argument('--batch-size', type=int, default=4, help='the batch size of updating') 10 | parse.add_argument('--lr', type=float, default=2.5e-4, help='learning rate of the algorithm') 11 | parse.add_argument('--epoch', type=int, default=4, help='the epoch during training') 12 | parse.add_argument('--nsteps', type=int, default=128, help='the steps to collect samples') 13 | parse.add_argument('--vloss-coef', type=float, default=0.5, help='the coefficient of value loss') 14 | parse.add_argument('--ent-coef', type=float, default=0.01, help='the entropy loss coefficient') 15 | parse.add_argument('--tau', type=float, default=0.95, help='gae coefficient') 16 | parse.add_argument('--cuda', action='store_true', help='use cuda do the training') 17 | parse.add_argument('--total-frames', type=int, default=20000000, help='the total frames for training') 18 | parse.add_argument('--dist', type=str, default='gauss', help='the distributions for sampling actions') 19 | parse.add_argument('--eps', type=float, default=1e-5, help='param for adam optimizer') 20 | parse.add_argument('--clip', type=float, default=0.1, help='the ratio clip param') 21 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the folder to save models') 22 | parse.add_argument('--lr-decay', action='store_true', help='if using the learning rate decay during decay') 23 | parse.add_argument('--max-grad-norm', type=float, default=0.5, help='grad norm') 24 | parse.add_argument('--display-interval', type=int, default=10, help='the interval that display log information') 25 | parse.add_argument('--env-type', type=str, default='atari', help='the type of the environment') 26 | parse.add_argument('--log-dir', type=str, default='logs', help='the folders to save the log files') 27 | 28 | args = parse.parse_args() 29 | 30 | return args 31 | -------------------------------------------------------------------------------- /rl_utils/env_wrapper/create_env.py: -------------------------------------------------------------------------------- 1 | from rl_utils.env_wrapper.atari_wrapper import make_atari, wrap_deepmind 2 | from rl_utils.env_wrapper.multi_envs_wrapper import SubprocVecEnv 3 | from rl_utils.env_wrapper.frame_stack import VecFrameStack 4 | from rl_utils.logger import logger, bench 5 | import os 6 | import gym 7 | 8 | """ 9 | this functions is to create the environments 10 | 11 | """ 12 | 13 | def create_single_env(args, rank=0): 14 | # setup the log files 15 | if rank == 0: 
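# only the first process (rank 0) creates the log directory and configures the logger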
16 | if not os.path.exists(args.log_dir): 17 | os.mkdir(args.log_dir) 18 | log_path = args.log_dir + '/{}/'.format(args.env_name) 19 | logger.configure(log_path) 20 | # start to create environment 21 | if args.env_type == 'atari': 22 | # create the environment 23 | env = make_atari(args.env_name) 24 | # the monitor 25 | env = bench.Monitor(env, logger.get_dir()) 26 | # use the deepmind environment wrapper 27 | env = wrap_deepmind(env, frame_stack=True) 28 | else: 29 | env = gym.make(args.env_name) 30 | # add log information 31 | env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True) 32 | # set seeds to the environment to make sure the reproducebility 33 | env.seed(args.seed + rank) 34 | return env 35 | 36 | # create multiple environments - for multiple 37 | def create_multiple_envs(args): 38 | # now only support the atari games 39 | if args.env_type == 'atari': 40 | def make_env(rank): 41 | def _thunk(): 42 | if not os.path.exists(args.log_dir): 43 | os.mkdir(args.log_dir) 44 | log_path = args.log_dir + '/{}/'.format(args.env_name) 45 | logger.configure(log_path) 46 | env = make_atari(args.env_name) 47 | # set the seed for the environment 48 | env.seed(args.seed + rank) 49 | # set loggler 50 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) 51 | # use the deepmind environment wrapper 52 | env = wrap_deepmind(env) 53 | return env 54 | return _thunk 55 | # put into sub processing 56 | envs = SubprocVecEnv([make_env(i) for i in range(args.num_workers)]) 57 | # then, frame stack 58 | envs = VecFrameStack(envs, 4) 59 | else: 60 | raise NotImplementedError 61 | return envs 62 | 63 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # the convolution layer of deepmind 6 | class deepmind(nn.Module): 7 | def __init__(self): 8 | super(deepmind, self).__init__() 9 | self.conv1 = nn.Conv2d(4, 32, 8, stride=4) 10 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 11 | self.conv3 = nn.Conv2d(64, 32, 3, stride=1) 12 | 13 | # start to do the init... 14 | nn.init.orthogonal_(self.conv1.weight.data, gain=nn.init.calculate_gain('relu')) 15 | nn.init.orthogonal_(self.conv2.weight.data, gain=nn.init.calculate_gain('relu')) 16 | nn.init.orthogonal_(self.conv3.weight.data, gain=nn.init.calculate_gain('relu')) 17 | # init the bias... 
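# the convolution biases are zero-initialized to go with the orthogonal weight initialization above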
18 | nn.init.constant_(self.conv1.bias.data, 0) 19 | nn.init.constant_(self.conv2.bias.data, 0) 20 | nn.init.constant_(self.conv3.bias.data, 0) 21 | 22 | def forward(self, x): 23 | x = F.relu(self.conv1(x)) 24 | x = F.relu(self.conv2(x)) 25 | x = F.relu(self.conv3(x)) 26 | x = x.view(-1, 32 * 7 * 7) 27 | 28 | return x 29 | 30 | # in the initial, just the nature CNN 31 | class net(nn.Module): 32 | def __init__(self, num_actions, use_dueling=False): 33 | super(net, self).__init__() 34 | # if use the dueling network 35 | self.use_dueling = use_dueling 36 | # define the network 37 | self.cnn_layer = deepmind() 38 | # if not use dueling 39 | if not self.use_dueling: 40 | self.fc1 = nn.Linear(32 * 7 * 7, 256) 41 | self.action_value = nn.Linear(256, num_actions) 42 | else: 43 | # the layer for dueling network architecture 44 | self.action_fc = nn.Linear(32 * 7 * 7, 256) 45 | self.state_value_fc = nn.Linear(32 * 7 * 7, 256) 46 | self.action_value = nn.Linear(256, num_actions) 47 | self.state_value = nn.Linear(256, 1) 48 | 49 | def forward(self, inputs): 50 | x = self.cnn_layer(inputs / 255.0) 51 | if not self.use_dueling: 52 | x = F.relu(self.fc1(x)) 53 | action_value_out = self.action_value(x) 54 | else: 55 | # get the action value 56 | action_fc = F.relu(self.action_fc(x)) 57 | action_value = self.action_value(action_fc) 58 | # get the state value 59 | state_value_fc = F.relu(self.state_value_fc(x)) 60 | state_value = self.state_value(state_value_fc) 61 | # action value mean 62 | action_value_mean = torch.mean(action_value, dim=1, keepdim=True) 63 | action_value_center = action_value - action_value_mean 64 | # Q = V + A 65 | action_value_out = state_value + action_value_center 66 | return action_value_out 67 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/demo.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | from models import cnn_net, mlp_net 3 | import torch 4 | import cv2 5 | import numpy as np 6 | import gym 7 | from rl_utils.env_wrapper.frame_stack import VecFrameStack 8 | from rl_utils.env_wrapper.atari_wrapper import make_atari, wrap_deepmind 9 | 10 | # denormalize 11 | def normalize(x, mean, std, clip=10): 12 | x -= mean 13 | x /= (std + 1e-8) 14 | return np.clip(x, -clip, clip) 15 | 16 | # get tensors for the agent 17 | def get_tensors(obs, env_type, filters=None): 18 | if env_type == 'atari': 19 | tensor = torch.tensor(np.transpose(obs, (2, 0, 1)), dtype=torch.float32).unsqueeze(0) 20 | elif env_type == 'mujoco': 21 | tensor = torch.tensor(normalize(obs, filters.rs.mean, filters.rs.std), dtype=torch.float32).unsqueeze(0) 22 | return tensor 23 | 24 | if __name__ == '__main__': 25 | # get the arguments 26 | args = get_args() 27 | # create the environment 28 | if args.env_type == 'atari': 29 | env = make_atari(args.env_name) 30 | env = wrap_deepmind(env, frame_stack=True) 31 | elif args.env_type == 'mujoco': 32 | env = gym.make(args.env_name) 33 | # get the model path 34 | model_path = args.save_dir + args.env_name + '/model.pt' 35 | # create the network 36 | if args.env_type == 'atari': 37 | network = cnn_net(env.action_space.n) 38 | network.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage)) 39 | filters = None 40 | elif args.env_type == 'mujoco': 41 | network = mlp_net(env.observation_space.shape[0], env.action_space.shape[0], args.dist) 42 | net_models, filters = torch.load(model_path, map_location=lambda storage, loc: storage) 
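# the two branches assume different checkpoint layouts: for atari, model.pt
# holds a bare state_dict, while for mujoco it holds a
# (state_dict, running_filter) tuple saved at training time, so the
# observation filter can be restored and reused in get_tensors()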
43 | # load models 44 | network.load_state_dict(net_models) 45 | # start to play the demo 46 | obs = env.reset() 47 | reward_total = 0 48 | # just one episode 49 | while True: 50 | env.render() 51 | with torch.no_grad(): 52 | obs_tensor = get_tensors(obs, args.env_type, filters) 53 | _, pi = network(obs_tensor) 54 | # get actions 55 | if args.env_type == 'atari': 56 | actions = torch.argmax(pi, dim=1).item() 57 | elif args.env_type == 'mujoco': 58 | if args.dist == 'gauss': 59 | mean, _ = pi 60 | actions = mean.numpy().squeeze() 61 | elif args.dist == 'beta': 62 | alpha, beta = pi 63 | actions = (alpha - 1) / (alpha + beta - 2) 64 | actions = actions.numpy().squeeze() 65 | actions = -1 + 2 * actions 66 | obs_, reward, done, _ = env.step(actions) 67 | reward_total += reward 68 | if done: 69 | break 70 | obs = obs_ 71 | print('the rewrads is: {}'.format(reward_total)) 72 | -------------------------------------------------------------------------------- /rl_utils/mpi_utils/normalizer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import numpy as np 3 | from mpi4py import MPI 4 | 5 | class normalizer: 6 | def __init__(self, size, eps=1e-2, default_clip_range=np.inf): 7 | self.size = size 8 | self.eps = eps 9 | self.default_clip_range = default_clip_range 10 | # some local information 11 | self.local_sum = np.zeros(self.size, np.float32) 12 | self.local_sumsq = np.zeros(self.size, np.float32) 13 | self.local_count = np.zeros(1, np.float32) 14 | # get the total sum sumsq and sum count 15 | self.total_sum = np.zeros(self.size, np.float32) 16 | self.total_sumsq = np.zeros(self.size, np.float32) 17 | self.total_count = np.ones(1, np.float32) 18 | # get the mean and std 19 | self.mean = np.zeros(self.size, np.float32) 20 | self.std = np.ones(self.size, np.float32) 21 | # thread locker 22 | self.lock = threading.Lock() 23 | 24 | # update the parameters of the normalizer 25 | def update(self, v): 26 | v = v.reshape(-1, self.size) 27 | # do the computing 28 | with self.lock: 29 | self.local_sum += v.sum(axis=0) 30 | self.local_sumsq += (np.square(v)).sum(axis=0) 31 | self.local_count[0] += v.shape[0] 32 | 33 | # sync the parameters across the cpus 34 | def sync(self, local_sum, local_sumsq, local_count): 35 | local_sum[...] = self._mpi_average(local_sum) 36 | local_sumsq[...] = self._mpi_average(local_sumsq) 37 | local_count[...] = self._mpi_average(local_count) 38 | return local_sum, local_sumsq, local_count 39 | 40 | def recompute_stats(self): 41 | with self.lock: 42 | local_count = self.local_count.copy() 43 | local_sum = self.local_sum.copy() 44 | local_sumsq = self.local_sumsq.copy() 45 | # reset 46 | self.local_count[...] = 0 47 | self.local_sum[...] = 0 48 | self.local_sumsq[...] 
= 0 49 | # synrc the stats 50 | sync_sum, sync_sumsq, sync_count = self.sync(local_sum, local_sumsq, local_count) 51 | # update the total stuff 52 | self.total_sum += sync_sum 53 | self.total_sumsq += sync_sumsq 54 | self.total_count += sync_count 55 | # calculate the new mean and std 56 | self.mean = self.total_sum / self.total_count 57 | self.std = np.sqrt(np.maximum(np.square(self.eps), (self.total_sumsq / self.total_count) - np.square(self.total_sum / self.total_count))) 58 | 59 | # average across the cpu's data 60 | def _mpi_average(self, x): 61 | buf = np.zeros_like(x) 62 | MPI.COMM_WORLD.Allreduce(x, buf, op=MPI.SUM) 63 | buf /= MPI.COMM_WORLD.Get_size() 64 | return buf 65 | 66 | # normalize the observation 67 | def normalize(self, v, clip_range=None): 68 | if clip_range is None: 69 | clip_range = self.default_clip_range 70 | return np.clip((v - self.mean) / (self.std), -clip_range, clip_range) 71 | -------------------------------------------------------------------------------- /rl_algorithms/sac/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.distributions.normal import Normal 4 | from torch.distributions import Distribution 5 | 6 | """ 7 | the tanhnormal distributions from rlkit may not stable 8 | 9 | """ 10 | class tanh_normal(Distribution): 11 | def __init__(self, normal_mean, normal_std, epsilon=1e-6, cuda=False): 12 | self.normal_mean = normal_mean 13 | self.normal_std = normal_std 14 | self.cuda = cuda 15 | self.normal = Normal(normal_mean, normal_std) 16 | self.epsilon = epsilon 17 | 18 | def sample_n(self, n, return_pre_tanh_value=False): 19 | z = self.normal.sample_n(n) 20 | if return_pre_tanh_value: 21 | return torch.tanh(z), z 22 | else: 23 | return torch.tanh(z) 24 | 25 | def log_prob(self, value, pre_tanh_value=None): 26 | """ 27 | :param value: some value, x 28 | :param pre_tanh_value: arctanh(x) 29 | :return: 30 | """ 31 | if pre_tanh_value is None: 32 | pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2 33 | return self.normal.log_prob(pre_tanh_value) - torch.log(1 - value * value + self.epsilon) 34 | 35 | def sample(self, return_pretanh_value=False): 36 | """ 37 | Gradients will and should *not* pass through this operation. 38 | 39 | See https://github.com/pytorch/pytorch/issues/4620 for discussion. 40 | """ 41 | z = self.normal.sample().detach() 42 | if return_pretanh_value: 43 | return torch.tanh(z), z 44 | else: 45 | return torch.tanh(z) 46 | 47 | def rsample(self, return_pretanh_value=False): 48 | """ 49 | Sampling in the reparameterization case. 
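        Concretely (as the code below does): draw eps ~ N(0, 1) and return
        tanh(mean + std * eps), so gradients can flow back into mean and std
        via the reparameterization trick, unlike sample() above which detaches.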
50 | """ 51 | sample_mean = torch.zeros(self.normal_mean.size(), dtype=torch.float32, device='cuda' if self.cuda else 'cpu') 52 | sample_std = torch.ones(self.normal_std.size(), dtype=torch.float32, device='cuda' if self.cuda else 'cpu') 53 | z = (self.normal_mean + self.normal_std * Normal(sample_mean, sample_std).sample()) 54 | z.requires_grad_() 55 | if return_pretanh_value: 56 | return torch.tanh(z), z 57 | else: 58 | return torch.tanh(z) 59 | 60 | # get action_infos 61 | class get_action_info: 62 | def __init__(self, pis, cuda=False): 63 | self.mean, self.std = pis 64 | self.dist = tanh_normal(normal_mean=self.mean, normal_std=self.std, cuda=cuda) 65 | 66 | # select actions 67 | def select_actions(self, exploration=True, reparameterize=True): 68 | if exploration: 69 | if reparameterize: 70 | actions, pretanh = self.dist.rsample(return_pretanh_value=True) 71 | return actions, pretanh 72 | else: 73 | actions = self.dist.sample() 74 | else: 75 | actions = torch.tanh(self.mean) 76 | return actions 77 | 78 | def get_log_prob(self, actions, pre_tanh_value): 79 | log_prob = self.dist.log_prob(actions, pre_tanh_value=pre_tanh_value) 80 | return log_prob.sum(dim=1, keepdim=True) 81 | -------------------------------------------------------------------------------- /rl_algorithms/sac/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | # define the arguments that will be used in the SAC 4 | def get_args(): 5 | parse = argparse.ArgumentParser() 6 | parse.add_argument('--env-name', type=str, default='HalfCheetah-v2', help='the environment name') 7 | parse.add_argument('--cuda', action='store_true', help='use GPU do the training') 8 | parse.add_argument('--seed', type=int, default=123, help='the random seed to reproduce results') 9 | parse.add_argument('--hidden-size', type=int, default=256, help='the size of the hidden layer') 10 | parse.add_argument('--train-loop-per-epoch', type=int, default=1, help='the training loop per epoch') 11 | parse.add_argument('--q-lr', type=float, default=3e-4, help='the learning rate of the critic') 12 | parse.add_argument('--p-lr', type=float, default=3e-4, help='the learning rate of the actor') 13 | parse.add_argument('--n-epochs', type=int, default=int(3e3), help='the number of total epochs') 14 | parse.add_argument('--epoch-length', type=int, default=int(1e3), help='the lenght of each epoch') 15 | parse.add_argument('--n-updates', type=int, default=int(1e3), help='the number of training updates execute') 16 | parse.add_argument('--init-exploration-steps', type=int, default=int(1e3), help='the steps of the initial exploration') 17 | parse.add_argument('--init-exploration-policy', type=str, default='gaussian', help='the inital exploration policy') 18 | parse.add_argument('--buffer-size', type=int, default=int(1e6), help='the size of the replay buffer') 19 | parse.add_argument('--batch-size', type=int, default=256, help='the batch size of samples for training') 20 | parse.add_argument('--reward-scale', type=float, default=1, help='the reward scale') 21 | parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor') 22 | parse.add_argument('--log-std-max', type=float, default=2, help='the maximum log std value') 23 | parse.add_argument('--log-std-min', type=float, default=-20, help='the minimum log std value') 24 | parse.add_argument('--entropy-weights', type=float, default=0.2, help='the entropy weights') 25 | parse.add_argument('--tau', type=float, default=5e-3, help='the 
soft update coefficient') 26 | parse.add_argument('--target-update-interval', type=int, default=1, help='the interval to update target network') 27 | parse.add_argument('--update-cycles', type=int, default=int(1e3), help='how many updates apply in the update') 28 | parse.add_argument('--eval-episodes', type=int, default=10, help='the episodes that used for evaluation') 29 | parse.add_argument('--display-interval', type=int, default=1, help='the display interval') 30 | parse.add_argument('--save-dir', type=str, default='saved_models/', help='the place to save models') 31 | parse.add_argument('--reg', type=float, default=1e-3, help='the reg term') 32 | parse.add_argument('--auto-ent-tuning', action='store_true', help='tune the entorpy automatically') 33 | parse.add_argument('--log-dir', type=str, default='logs', help='dir to save log information') 34 | parse.add_argument('--env-type', type=str, default=None, help='environment type') 35 | 36 | return parse.parse_args() 37 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | """ 6 | this network also include gaussian distribution and beta distribution 7 | 8 | """ 9 | 10 | class mlp_net(nn.Module): 11 | def __init__(self, state_size, num_actions, dist_type): 12 | super(mlp_net, self).__init__() 13 | self.dist_type = dist_type 14 | self.fc1_v = nn.Linear(state_size, 64) 15 | self.fc2_v = nn.Linear(64, 64) 16 | self.fc1_a = nn.Linear(state_size, 64) 17 | self.fc2_a = nn.Linear(64, 64) 18 | # check the type of distribution 19 | if self.dist_type == 'gauss': 20 | self.sigma_log = nn.Parameter(torch.zeros(1, num_actions)) 21 | self.action_mean = nn.Linear(64, num_actions) 22 | self.action_mean.weight.data.mul_(0.1) 23 | self.action_mean.bias.data.zero_() 24 | elif self.dist_type == 'beta': 25 | self.action_alpha = nn.Linear(64, num_actions) 26 | self.action_beta = nn.Linear(64, num_actions) 27 | # init.. 28 | self.action_alpha.weight.data.mul_(0.1) 29 | self.action_alpha.bias.data.zero_() 30 | self.action_beta.weight.data.mul_(0.1) 31 | self.action_beta.bias.data.zero_() 32 | 33 | # define layers to output state value 34 | self.value = nn.Linear(64, 1) 35 | self.value.weight.data.mul_(0.1) 36 | self.value.bias.data.zero_() 37 | 38 | def forward(self, x): 39 | x_v = torch.tanh(self.fc1_v(x)) 40 | x_v = torch.tanh(self.fc2_v(x_v)) 41 | state_value = self.value(x_v) 42 | # output the policy... 43 | x_a = torch.tanh(self.fc1_a(x)) 44 | x_a = torch.tanh(self.fc2_a(x_a)) 45 | if self.dist_type == 'gauss': 46 | mean = self.action_mean(x_a) 47 | sigma_log = self.sigma_log.expand_as(mean) 48 | sigma = torch.exp(sigma_log) 49 | pi = (mean, sigma) 50 | elif self.dist_type == 'beta': 51 | alpha = F.softplus(self.action_alpha(x_a)) + 1 52 | beta = F.softplus(self.action_beta(x_a)) + 1 53 | pi = (alpha, beta) 54 | 55 | return state_value, pi 56 | 57 | # the convolution layer of deepmind 58 | class deepmind(nn.Module): 59 | def __init__(self): 60 | super(deepmind, self).__init__() 61 | self.conv1 = nn.Conv2d(4, 32, 8, stride=4) 62 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 63 | self.conv3 = nn.Conv2d(64, 32, 3, stride=1) 64 | self.fc1 = nn.Linear(32 * 7 * 7, 512) 65 | # start to do the init... 
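# the initialization below follows the common A2C/PPO recipe: orthogonal
# weights with the ReLU gain (sqrt(2)) and zero biases for the convolutional
# and hidden layers, while the policy head further down uses a much smaller
# gain (0.01) so the initial action distribution is close to uniform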
66 | nn.init.orthogonal_(self.conv1.weight.data, gain=nn.init.calculate_gain('relu')) 67 | nn.init.orthogonal_(self.conv2.weight.data, gain=nn.init.calculate_gain('relu')) 68 | nn.init.orthogonal_(self.conv3.weight.data, gain=nn.init.calculate_gain('relu')) 69 | nn.init.orthogonal_(self.fc1.weight.data, gain=nn.init.calculate_gain('relu')) 70 | # init the bias... 71 | nn.init.constant_(self.conv1.bias.data, 0) 72 | nn.init.constant_(self.conv2.bias.data, 0) 73 | nn.init.constant_(self.conv3.bias.data, 0) 74 | nn.init.constant_(self.fc1.bias.data, 0) 75 | 76 | def forward(self, x): 77 | x = F.relu(self.conv1(x)) 78 | x = F.relu(self.conv2(x)) 79 | x = F.relu(self.conv3(x)) 80 | x = x.view(-1, 32 * 7 * 7) 81 | x = F.relu(self.fc1(x)) 82 | return x 83 | 84 | # in the initial, just the nature CNN 85 | class cnn_net(nn.Module): 86 | def __init__(self, num_actions): 87 | super(cnn_net, self).__init__() 88 | self.cnn_layer = deepmind() 89 | self.critic = nn.Linear(512, 1) 90 | self.actor = nn.Linear(512, num_actions) 91 | # init the linear layer.. 92 | nn.init.orthogonal_(self.critic.weight.data) 93 | nn.init.constant_(self.critic.bias.data, 0) 94 | # init the policy layer... 95 | nn.init.orthogonal_(self.actor.weight.data, gain=0.01) 96 | nn.init.constant_(self.actor.bias.data, 0) 97 | 98 | def forward(self, inputs): 99 | x = self.cnn_layer(inputs / 255.0) 100 | value = self.critic(x) 101 | pi = F.softmax(self.actor(x), dim=1) 102 | return value, pi 103 | -------------------------------------------------------------------------------- /rl_utils/logger/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | import seaborn as sns 4 | from rl_utils.bench import load_results 5 | 6 | sns.set(style="dark") 7 | sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 2}) 8 | sns.set(rc={"figure.figsize": (15, 8)}) 9 | colors = sns.color_palette(palette='muted') 10 | 11 | 12 | X_TIMESTEPS = 'timesteps' 13 | X_EPISODES = 'episodes' 14 | X_WALLTIME = 'walltime_hrs' 15 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 16 | EPISODES_WINDOW = 150 17 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 18 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 19 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 20 | 21 | def rolling_window(a, window): 22 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 23 | strides = a.strides + (a.strides[-1],) 24 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 25 | 26 | def window_func(x, y, window, func): 27 | yw = rolling_window(y, window) 28 | yw_func = func(yw, axis=-1) 29 | return x[window-1:], yw_func 30 | 31 | def ts2xy(ts, xaxis): 32 | if xaxis == X_TIMESTEPS: 33 | x = np.cumsum(ts.l.values) 34 | y = ts.r.values 35 | elif xaxis == X_EPISODES: 36 | x = np.arange(len(ts)) 37 | y = ts.r.values 38 | elif xaxis == X_WALLTIME: 39 | x = ts.t.values / 3600. 
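# ts is the dataframe produced by load_results(): the monitor files store one
# row per episode with columns r (episode reward), l (episode length in
# steps) and t (wall-clock seconds since start), hence the division by 3600
# to plot walltime in hours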
40 | y = ts.r.values 41 | else: 42 | raise NotImplementedError 43 | return x, y 44 | 45 | def plot_curves(xy_list, xaxis, title, plt_order, beta=False): 46 | maxx = max(xy[0][-1] for xy in xy_list) 47 | minx = 0 48 | if beta == 'dqn': 49 | label = ['DQN'] 50 | elif beta == 'ddqn': 51 | label = ['Double-DQN'] 52 | elif beta == 'dueling': 53 | label = ['Dueling-DQN'] 54 | psub = plt.subplot(plt_order) 55 | plt.tight_layout() 56 | plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) 57 | for (i, (x, y)) in enumerate(xy_list): 58 | #plt.scatter(x, y, s=2) 59 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes 60 | psub.plot(x, y_mean, label=label[i]) 61 | psub.set_xlim([minx, maxx]) 62 | psub.set_title(title) 63 | psub.legend(loc='best') 64 | psub.set_xlabel(xaxis) 65 | psub.set_ylabel("rewards") 66 | 67 | def plot_results(dirs, num_timesteps, xaxis, task_name, plt_order, beta=False): 68 | tslist = [] 69 | for dir in dirs: 70 | ts = load_results(dir) 71 | ts = ts[ts.l.cumsum() <= num_timesteps] 72 | tslist.append(ts) 73 | xy_list = [ts2xy(ts, xaxis) for ts in tslist] 74 | plot_curves(xy_list, xaxis, task_name, plt_order, beta) 75 | 76 | def main(): 77 | import argparse 78 | import os 79 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 80 | parser.add_argument('--dirs', help='List of log directories', nargs = '*', default='logs_dqn/') 81 | parser.add_argument('--num_timesteps', type=int, default=int(2e7)) 82 | parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) 83 | parser.add_argument('--task_name', help = 'Title of plot', default = 'BreakoutNoFrameskip-v4') 84 | args = parser.parse_args() 85 | env_name = ['BankHeistNoFrameskip-v4', 'BreakoutNoFrameskip-v4', 'KangarooNoFrameskip-v4', \ 86 | 'PongNoFrameskip-v4', 'SeaquestNoFrameskip-v4', 'SpaceInvadersNoFrameskip-v4'] 87 | dirs = [os.path.abspath(args.dirs + name) for name in env_name] 88 | for idx in range(len(dirs)): 89 | plot_results([dirs[idx]], args.num_timesteps, args.xaxis, env_name[idx], 231+idx, beta='dqn') 90 | double_dirs = [os.path.abspath('logs_ddqn/' + name) for name in env_name] 91 | for idx in range(len(dirs)): 92 | plot_results([double_dirs[idx]], args.num_timesteps, args.xaxis, env_name[idx], 231+idx, beta='ddqn') 93 | dueling_dirs = [os.path.abspath('logs/' + name) for name in env_name] 94 | for idx in range(len(dirs)): 95 | plot_results([dueling_dirs[idx]], args.num_timesteps, args.xaxis, env_name[idx], 231+idx, beta='dueling') 96 | plt.savefig("dueling.png") 97 | 98 | if __name__ == '__main__': 99 | main() 100 | 101 | -------------------------------------------------------------------------------- /rl_utils/env_wrapper/multi_envs_wrapper.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import numpy as np 3 | from rl_utils.env_wrapper import VecEnv, CloudpickleWrapper, clear_mpi_env_vars 4 | 5 | def worker(remote, parent_remote, env_fn_wrapper): 6 | parent_remote.close() 7 | env = env_fn_wrapper.x() 8 | try: 9 | while True: 10 | cmd, data = remote.recv() 11 | if cmd == 'step': 12 | ob, reward, done, info = env.step(data) 13 | if done: 14 | ob = env.reset() 15 | remote.send((ob, reward, done, info)) 16 | elif cmd == 'reset': 17 | ob = env.reset() 18 | remote.send(ob) 19 | elif cmd == 'render': 20 | remote.send(env.render(mode='rgb_array')) 21 | elif cmd == 'close': 22 | remote.close() 23 | break 24 | elif cmd == 
'get_spaces_spec': 25 | remote.send((env.observation_space, env.action_space, env.spec)) 26 | else: 27 | raise NotImplementedError 28 | except KeyboardInterrupt: 29 | print('SubprocVecEnv worker: got KeyboardInterrupt') 30 | finally: 31 | env.close() 32 | 33 | 34 | class SubprocVecEnv(VecEnv): 35 | """ 36 | VecEnv that runs multiple environments in parallel in subproceses and communicates with them via pipes. 37 | Recommended to use when num_envs > 1 and step() can be a bottleneck. 38 | """ 39 | def __init__(self, env_fns, spaces=None, context='spawn'): 40 | """ 41 | Arguments: 42 | 43 | env_fns: iterable of callables - functions that create environments to run in subprocesses. Need to be cloud-pickleable 44 | """ 45 | self.waiting = False 46 | self.closed = False 47 | nenvs = len(env_fns) 48 | ctx = mp.get_context(context) 49 | self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(nenvs)]) 50 | self.ps = [ctx.Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 51 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 52 | for p in self.ps: 53 | p.daemon = True # if the main process crashes, we should not cause things to hang 54 | with clear_mpi_env_vars(): 55 | p.start() 56 | for remote in self.work_remotes: 57 | remote.close() 58 | 59 | self.remotes[0].send(('get_spaces_spec', None)) 60 | observation_space, action_space, self.spec = self.remotes[0].recv() 61 | self.viewer = None 62 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 63 | 64 | def step_async(self, actions): 65 | self._assert_not_closed() 66 | for remote, action in zip(self.remotes, actions): 67 | remote.send(('step', action)) 68 | self.waiting = True 69 | 70 | def step_wait(self): 71 | self._assert_not_closed() 72 | results = [remote.recv() for remote in self.remotes] 73 | self.waiting = False 74 | obs, rews, dones, infos = zip(*results) 75 | return _flatten_obs(obs), np.stack(rews), np.stack(dones), infos 76 | 77 | def reset(self): 78 | self._assert_not_closed() 79 | for remote in self.remotes: 80 | remote.send(('reset', None)) 81 | return _flatten_obs([remote.recv() for remote in self.remotes]) 82 | 83 | def close_extras(self): 84 | self.closed = True 85 | if self.waiting: 86 | for remote in self.remotes: 87 | remote.recv() 88 | for remote in self.remotes: 89 | remote.send(('close', None)) 90 | for p in self.ps: 91 | p.join() 92 | 93 | def get_images(self): 94 | self._assert_not_closed() 95 | for pipe in self.remotes: 96 | pipe.send(('render', None)) 97 | imgs = [pipe.recv() for pipe in self.remotes] 98 | return imgs 99 | 100 | def _assert_not_closed(self): 101 | assert not self.closed, "Trying to operate on a SubprocVecEnv after calling close()" 102 | 103 | def __del__(self): 104 | if not self.closed: 105 | self.close() 106 | 107 | def _flatten_obs(obs): 108 | assert isinstance(obs, (list, tuple)) 109 | assert len(obs) > 0 110 | 111 | if isinstance(obs[0], dict): 112 | keys = obs[0].keys() 113 | return {k: np.stack([o[k] for o in obs]) for k in keys} 114 | else: 115 | return np.stack(obs) 116 | -------------------------------------------------------------------------------- /rl_algorithms/dqn_algos/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from models import net 4 | from utils import linear_schedule, select_actions, reward_recorder 5 | from rl_utils.experience_replay.experience_replay import replay_buffer 6 | import torch 7 | from 
datetime import datetime 8 | import os 9 | import copy 10 | 11 | # define the dqn agent 12 | class dqn_agent: 13 | def __init__(self, env, args): 14 | # define some important 15 | self.env = env 16 | self.args = args 17 | # define the network 18 | self.net = net(self.env.action_space.n, self.args.use_dueling) 19 | # copy the self.net as the 20 | self.target_net = copy.deepcopy(self.net) 21 | # make sure the target net has the same weights as the network 22 | self.target_net.load_state_dict(self.net.state_dict()) 23 | if self.args.cuda: 24 | self.net.cuda() 25 | self.target_net.cuda() 26 | # define the optimizer 27 | self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.args.lr) 28 | # define the replay memory 29 | self.buffer = replay_buffer(self.args.buffer_size) 30 | # define the linear schedule of the exploration 31 | self.exploration_schedule = linear_schedule(int(self.args.total_timesteps * self.args.exploration_fraction), \ 32 | self.args.final_ratio, self.args.init_ratio) 33 | # create the folder to save the models 34 | if not os.path.exists(self.args.save_dir): 35 | os.mkdir(self.args.save_dir) 36 | # set the environment folder 37 | self.model_path = os.path.join(self.args.save_dir, self.args.env_name) 38 | if not os.path.exists(self.model_path): 39 | os.mkdir(self.model_path) 40 | 41 | # start to do the training 42 | def learn(self): 43 | # the episode reward 44 | episode_reward = reward_recorder() 45 | obs = np.array(self.env.reset()) 46 | td_loss = 0 47 | for timestep in range(self.args.total_timesteps): 48 | explore_eps = self.exploration_schedule.get_value(timestep) 49 | with torch.no_grad(): 50 | obs_tensor = self._get_tensors(obs) 51 | action_value = self.net(obs_tensor) 52 | # select actions 53 | action = select_actions(action_value, explore_eps) 54 | # excute actions 55 | obs_, reward, done, _ = self.env.step(action) 56 | obs_ = np.array(obs_) 57 | # tryint to append the samples 58 | self.buffer.add(obs, action, reward, obs_, float(done)) 59 | obs = obs_ 60 | # add the rewards 61 | episode_reward.add_rewards(reward) 62 | if done: 63 | obs = np.array(self.env.reset()) 64 | # start new episode to store rewards 65 | episode_reward.start_new_episode() 66 | if timestep > self.args.learning_starts and timestep % self.args.train_freq == 0: 67 | # start to sample the samples from the replay buffer 68 | batch_samples = self.buffer.sample(self.args.batch_size) 69 | td_loss = self._update_network(batch_samples) 70 | if timestep > self.args.learning_starts and timestep % self.args.target_network_update_freq == 0: 71 | # update the target network 72 | self.target_net.load_state_dict(self.net.state_dict()) 73 | if done and episode_reward.num_episodes % self.args.display_interval == 0: 74 | print('[{}] Frames: {}, Episode: {}, Mean: {:.3f}, Loss: {:.3f}'.format(datetime.now(), timestep, episode_reward.num_episodes, \ 75 | episode_reward.mean, td_loss)) 76 | torch.save(self.net.state_dict(), self.model_path + '/model.pt') 77 | 78 | # update the network 79 | def _update_network(self, samples): 80 | obses, actions, rewards, obses_next, dones = samples 81 | # convert the data to tensor 82 | obses = self._get_tensors(obses) 83 | actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(-1) 84 | rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(-1) 85 | obses_next = self._get_tensors(obses_next) 86 | dones = torch.tensor(1 - dones, dtype=torch.float32).unsqueeze(-1) 87 | # convert into gpu 88 | if self.args.cuda: 89 | actions = actions.cuda() 90 | rewards = 
rewards.cuda() 91 | dones = dones.cuda() 92 | # calculate the target value 93 | with torch.no_grad(): 94 | # if use the double network architecture 95 | if self.args.use_double_net: 96 | q_value_ = self.net(obses_next) 97 | action_max_idx = torch.argmax(q_value_, dim=1, keepdim=True) 98 | target_action_value = self.target_net(obses_next) 99 | target_action_max_value = target_action_value.gather(1, action_max_idx) 100 | else: 101 | target_action_value = self.target_net(obses_next) 102 | target_action_max_value, _ = torch.max(target_action_value, dim=1, keepdim=True) 103 | # target 104 | expected_value = rewards + self.args.gamma * target_action_max_value * dones 105 | # get the real q value 106 | action_value = self.net(obses) 107 | real_value = action_value.gather(1, actions) 108 | loss = (expected_value - real_value).pow(2).mean() 109 | # start to update 110 | self.optimizer.zero_grad() 111 | loss.backward() 112 | self.optimizer.step() 113 | return loss.item() 114 | 115 | # get tensors 116 | def _get_tensors(self, obs): 117 | if obs.ndim == 3: 118 | obs = np.transpose(obs, (2, 0, 1)) 119 | obs = np.expand_dims(obs, 0) 120 | elif obs.ndim == 4: 121 | obs = np.transpose(obs, (0, 3, 1, 2)) 122 | obs = torch.tensor(obs, dtype=torch.float32) 123 | if self.args.cuda: 124 | obs = obs.cuda() 125 | return obs 126 | -------------------------------------------------------------------------------- /rl_utils/logger/bench.py: -------------------------------------------------------------------------------- 1 | __all__ = ['Monitor', 'get_monitor_files', 'load_results'] 2 | 3 | from gym.core import Wrapper 4 | import time 5 | from glob import glob 6 | import csv 7 | import os.path as osp 8 | import json 9 | 10 | class Monitor(Wrapper): 11 | EXT = "monitor.csv" 12 | f = None 13 | 14 | def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()): 15 | Wrapper.__init__(self, env=env) 16 | self.tstart = time.time() 17 | if filename: 18 | self.results_writer = ResultsWriter(filename, 19 | header={"t_start": time.time(), 'env_id' : env.spec and env.spec.id}, 20 | extra_keys=reset_keywords + info_keywords 21 | ) 22 | else: 23 | self.results_writer = None 24 | self.reset_keywords = reset_keywords 25 | self.info_keywords = info_keywords 26 | self.allow_early_resets = allow_early_resets 27 | self.rewards = None 28 | self.needs_reset = True 29 | self.episode_rewards = [] 30 | self.episode_lengths = [] 31 | self.episode_times = [] 32 | self.total_steps = 0 33 | self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() 34 | 35 | def reset(self, **kwargs): 36 | self.reset_state() 37 | for k in self.reset_keywords: 38 | v = kwargs.get(k) 39 | if v is None: 40 | raise ValueError('Expected you to pass kwarg %s into reset'%k) 41 | self.current_reset_info[k] = v 42 | return self.env.reset(**kwargs) 43 | 44 | def reset_state(self): 45 | if not self.allow_early_resets and not self.needs_reset: 46 | raise RuntimeError("Tried to reset an environment before done. 
If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)") 47 | self.rewards = [] 48 | self.needs_reset = False 49 | 50 | 51 | def step(self, action): 52 | if self.needs_reset: 53 | raise RuntimeError("Tried to step environment that needs reset") 54 | ob, rew, done, info = self.env.step(action) 55 | self.update(ob, rew, done, info) 56 | return (ob, rew, done, info) 57 | 58 | def update(self, ob, rew, done, info): 59 | self.rewards.append(rew) 60 | if done: 61 | self.needs_reset = True 62 | eprew = sum(self.rewards) 63 | eplen = len(self.rewards) 64 | epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)} 65 | for k in self.info_keywords: 66 | epinfo[k] = info[k] 67 | self.episode_rewards.append(eprew) 68 | self.episode_lengths.append(eplen) 69 | self.episode_times.append(time.time() - self.tstart) 70 | epinfo.update(self.current_reset_info) 71 | if self.results_writer: 72 | self.results_writer.write_row(epinfo) 73 | assert isinstance(info, dict) 74 | if isinstance(info, dict): 75 | info['episode'] = epinfo 76 | 77 | self.total_steps += 1 78 | 79 | def close(self): 80 | if self.f is not None: 81 | self.f.close() 82 | 83 | def get_total_steps(self): 84 | return self.total_steps 85 | 86 | def get_episode_rewards(self): 87 | return self.episode_rewards 88 | 89 | def get_episode_lengths(self): 90 | return self.episode_lengths 91 | 92 | def get_episode_times(self): 93 | return self.episode_times 94 | 95 | class LoadMonitorResultsError(Exception): 96 | pass 97 | 98 | 99 | class ResultsWriter(object): 100 | def __init__(self, filename, header='', extra_keys=()): 101 | self.extra_keys = extra_keys 102 | assert filename is not None 103 | if not filename.endswith(Monitor.EXT): 104 | if osp.isdir(filename): 105 | filename = osp.join(filename, Monitor.EXT) 106 | else: 107 | filename = filename + "." 
+ Monitor.EXT 108 | self.f = open(filename, "wt") 109 | if isinstance(header, dict): 110 | header = '# {} \n'.format(json.dumps(header)) 111 | self.f.write(header) 112 | self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+tuple(extra_keys)) 113 | self.logger.writeheader() 114 | self.f.flush() 115 | 116 | def write_row(self, epinfo): 117 | if self.logger: 118 | self.logger.writerow(epinfo) 119 | self.f.flush() 120 | 121 | 122 | def get_monitor_files(dir): 123 | return glob(osp.join(dir, "*" + Monitor.EXT)) 124 | 125 | def load_results(dir): 126 | import pandas 127 | monitor_files = ( 128 | glob(osp.join(dir, "*monitor.json")) + 129 | glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files 130 | if not monitor_files: 131 | raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir)) 132 | dfs = [] 133 | headers = [] 134 | for fname in monitor_files: 135 | with open(fname, 'rt') as fh: 136 | if fname.endswith('csv'): 137 | firstline = fh.readline() 138 | if not firstline: 139 | continue 140 | assert firstline[0] == '#' 141 | header = json.loads(firstline[1:]) 142 | df = pandas.read_csv(fh, index_col=None) 143 | headers.append(header) 144 | elif fname.endswith('json'): # Deprecated json format 145 | episodes = [] 146 | lines = fh.readlines() 147 | header = json.loads(lines[0]) 148 | headers.append(header) 149 | for line in lines[1:]: 150 | episode = json.loads(line) 151 | episodes.append(episode) 152 | df = pandas.DataFrame(episodes) 153 | else: 154 | assert 0, 'unreachable' 155 | df['t'] += header['t_start'] 156 | dfs.append(df) 157 | df = pandas.concat(dfs) 158 | df.sort_values('t', inplace=True) 159 | df.reset_index(inplace=True) 160 | df['t'] -= min(header['t_start'] for header in headers) 161 | df.headers = headers # HACK to preserve backwards compatibility 162 | return df 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning Algorithms 2 | ![logo](figures/logo.png) 3 |   4 | ![MIT License](https://img.shields.io/badge/license-MIT-blue.svg) 5 | This repository will implement the classic deep reinforcement learning algorithms by using **PyTorch**. The aim of this repository is to provide clear code for people to learn the deep reinforcemen learning algorithms. In the future, more algorithms will be added and the existing codes will also be maintained. 6 | ## Current Implementations 7 | - [x] Deep Q-Learning Network (DQN) 8 | - [x] Basic DQN 9 | - [x] Double Q network 10 | - [x] Dueling Network Archtiecure 11 | - [x] Deep Deterministic Policy Gradient (DDPG) 12 | - [x] Advantage Actor-Critic (A2C) 13 | - [x] Trust Region Policy Gradient (TRPO) 14 | - [x] Proximal Policy Optimization (PPO) 15 | - [ ] Actor Critic using Kronecker-Factored Trust Region (ACKTR) 16 | - [x] Soft Actor-Critic (SAC) 17 | ## Update Info 18 | :triangular_flag_on_post: **2018-10-17** - In this update, most of algorithms have been imporved and **add more experiments with plots** (except for DPPG). The **PPO** now supports **atari-games** and **mujoco-env**. The **TRPO** is much stable and can have better results! 19 |   20 | :triangular_flag_on_post: **2019-07-15** - In this update, the installation for the openai baseline is no longer needed. I have intergated useful functions in the **rl__utils** module. DDPG is also re-implemented and support more results. README file has been modified. 
The code structure has also been adjusted slightly. 21 |   22 | :triangular_flag_on_post: **2019-07-26** - In this update, the revised repository is made public. In order to keep the repository light, I **rebuilt** the repository and deleted the previous version, but a backup is kept on Google Drive. 23 |   24 | :triangular_flag_on_post: **2019-11-13** - Changed the code structure of the repo: all algorithms have been moved to the `rl_algorithms/` folder. Added the Soft Actor-Critic method; the experiment plots will be added soon. 25 | ## TODO List 26 | - [ ] add prioritized experience replay. 27 | - [x] remove the dependency on OpenAI Baselines' pre-processing functions. 28 | - [x] improve **DDPG** - a PyTorch Hindsight Experience Replay (HER) with DDPG is already implemented; you could check it [here](https://github.com/TianhongDai/hindsight-experience-replay). 29 | - [ ] upload pre-trained models to Google Drive (will update soon!). 30 | ## Requirements 31 | - pytorch=1.0.1 32 | - gym=0.12.5 33 | - mpi4py 34 | - mujoco-py 35 | - opencv-python 36 | - cloudpickle 37 | ## Installation 38 | 1. Install our `rl_utils` module: 39 | ```bash 40 | pip install -e . 41 | ``` 42 | 2. Install MuJoCo: please follow the instructions on the [official website](https://github.com/openai/mujoco-py). 43 | 3. Install Atari and Box2D: 44 | ```bash 45 | sudo apt-get install swig # or: brew install swig (macOS) 46 | pip install gym[atari] 47 | pip install gym[box2d] 48 | pip install box2d box2d-kengz 49 | ``` 50 | ## Instructions 51 | 1. Train the agent (details can be found in each folder): 52 | ``` 53 | cd rl_algorithms// 54 | python train.py -- 55 | ``` 56 | 2. Play the demo: 57 | ``` 58 | cd rl_algorithms// 59 | python demo.py -- 60 | ``` 61 | ## Code Structures 62 | 1. **rl_algorithms**: 63 | - `arguments.py`: contains the parameters used in training. 64 | - `_agent.py`: contains the core of each reinforcement learning algorithm. 65 | - `models.py`: the network structure for the policy and value function. 66 | - `utils.py`: some useful functions, such as **select actions**. 67 | - `train.py`: the script to train the agent. 68 | - `demo.py`: visualizes the trained agent. 69 | 2. **rl_utils** module: 70 | - `env_wrapper/`: contains the pre-processing functions for the Atari games and wrappers to create environments. 71 | - `experience_replay/`: contains the experience replay for the off-policy RL algorithms. 72 | - `logger/`: contains functions to record log information during training. 73 | - `mpi_utils/`: contains the tools for MPI training. 74 | - `running_filter/`: contains the running mean filter used to normalize observations in the MuJoCo environments. 75 | - `seeds/`: contains the function to set up random seeds for reproducible training. 76 | ## Example Results 77 | ### 1. DQN algorithms 78 | ![dqn_performance](figures/01_dqn.png) 79 | ### 2. DDPG 80 | ![ddpg](figures/02_ddpg.png) 81 | ### 3. A2C 82 | ![a2c](figures/03_a2c.png) 83 | ### 4. TRPO 84 | ![trpo](figures/04_trpo.png) 85 | ### 5. PPO 86 | ![ppo](figures/05_ppo.png) 87 | ### 6.
SAC 88 | ![sac](figures/06_sac.png) 89 | 90 | ## Demos 91 | Atari Env (BreakoutNoFrameskip-v4)| Box2d Env (BipedalWalker-v2)| Mujoco Env (Hopper-v2) 92 | -----------------------|-----------------------|-----------------------| 93 | ![](figures/breakout.gif)| ![](figures/bipedal.gif)| ![](figures/hopper.gif) 94 | ## Acknowledgement 95 | - [Ilya Kostrikov's GitHub](https://github.com/ikostrikov) 96 | - [Openai Baselines](https://github.com/openai/baselines) 97 | - [Kai's suggestions to simplify MPI functions](https://github.com/Kaixhin) 98 | - [rlkit](https://github.com/vitchyr/rlkit) 99 | 100 | ## Related Papers 101 | [1] [A Brief Survey of Deep Reinforcement Learning](https://arxiv.org/abs/1708.05866) 102 | [2] [The Beta Policy for Continuous Control Reinforcement Learning](https://www.ri.cmu.edu/wp-content/uploads/2017/06/thesis-Chou.pdf) 103 | [3] [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) 104 | [4] [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461) 105 | [5] [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581) 106 | [6] [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) 107 | [7] [Continuous Deep Q-Learning with Model-based Acceleration](https://arxiv.org/abs/1603.00748) 108 | [8] [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/abs/1602.01783) 109 | [9] [Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477) 110 | [10] [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) 111 | [11] [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905) 112 | [12] [Scalable trust-region method for deep reinforcement learning using Kronecker-factored approximation](https://arxiv.org/abs/1708.05144) 113 | -------------------------------------------------------------------------------- /rl_algorithms/a2c/a2c_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from models import net 4 | from datetime import datetime 5 | from utils import select_actions, evaluate_actions, discount_with_dones 6 | import os 7 | 8 | class a2c_agent: 9 | def __init__(self, envs, args): 10 | self.envs = envs 11 | self.args = args 12 | # define the network 13 | self.net = net(self.envs.action_space.n) 14 | if self.args.cuda: 15 | self.net.cuda() 16 | # define the optimizer 17 | self.optimizer = torch.optim.RMSprop(self.net.parameters(), lr=self.args.lr, eps=self.args.eps, alpha=self.args.alpha) 18 | if not os.path.exists(self.args.save_dir): 19 | os.mkdir(self.args.save_dir) 20 | # check the saved path for envs.. 21 | self.model_path = self.args.save_dir + self.args.env_name + '/' 22 | if not os.path.exists(self.model_path): 23 | os.mkdir(self.model_path) 24 | # get the obs.. 25 | self.batch_ob_shape = (self.args.num_workers * self.args.nsteps,) + self.envs.observation_space.shape 26 | self.obs = np.zeros((self.args.num_workers,) + self.envs.observation_space.shape, dtype=self.envs.observation_space.dtype.name) 27 | self.obs[:] = self.envs.reset() 28 | self.dones = [False for _ in range(self.args.num_workers)] 29 | 30 | # train the network.. 
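# learn() below alternates short rollouts and updates: each update collects
# args.nsteps transitions from each of args.num_workers parallel envs, so one
# update consumes num_workers * nsteps frames, which is exactly how
# num_updates is derived from args.total_frames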
31 | def learn(self): 32 | num_updates = self.args.total_frames // (self.args.num_workers * self.args.nsteps) 33 | # get the reward to calculate other information 34 | episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32) 35 | final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32) 36 | # start to update 37 | for update in range(num_updates): 38 | mb_obs, mb_rewards, mb_actions, mb_dones = [],[],[],[] 39 | for step in range(self.args.nsteps): 40 | with torch.no_grad(): 41 | input_tensor = self._get_tensors(self.obs) 42 | _, pi = self.net(input_tensor) 43 | # select actions 44 | actions = select_actions(pi) 45 | cpu_actions = actions.squeeze(1).cpu().numpy() 46 | # start to store the information 47 | mb_obs.append(np.copy(self.obs)) 48 | mb_actions.append(cpu_actions) 49 | mb_dones.append(self.dones) 50 | # step 51 | obs, rewards, dones, _ = self.envs.step(cpu_actions) 52 | # start to store the rewards 53 | self.dones = dones 54 | mb_rewards.append(rewards) 55 | for n, done in enumerate(dones): 56 | if done: 57 | self.obs[n] = self.obs[n]*0 58 | self.obs = obs 59 | episode_rewards += rewards 60 | # get the masks 61 | masks = np.array([0.0 if done else 1.0 for done in dones], dtype=np.float32) 62 | final_rewards *= masks 63 | final_rewards += (1 - masks) * episode_rewards 64 | episode_rewards *= masks 65 | # update the obs 66 | mb_dones.append(self.dones) 67 | # process the rollouts 68 | mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape) 69 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) 70 | mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) 71 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) 72 | mb_masks = mb_dones[:, :-1] 73 | mb_dones = mb_dones[:, 1:] 74 | # calculate the last value 75 | with torch.no_grad(): 76 | input_tensor = self._get_tensors(self.obs) 77 | last_values, _ = self.net(input_tensor) 78 | # compute returns 79 | for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values.detach().cpu().numpy().squeeze())): 80 | rewards = rewards.tolist() 81 | dones = dones.tolist() 82 | if dones[-1] == 0: 83 | rewards = discount_with_dones(rewards+[value], dones+[0], self.args.gamma)[:-1] 84 | else: 85 | rewards = discount_with_dones(rewards, dones, self.args.gamma) 86 | mb_rewards[n] = rewards 87 | mb_rewards = mb_rewards.flatten() 88 | mb_actions = mb_actions.flatten() 89 | # start to update network 90 | vl, al, ent = self._update_network(mb_obs, mb_rewards, mb_actions) 91 | if update % self.args.log_interval == 0: 92 | print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.1f}, VL: {:.3f}, PL: {:.3f}, Ent: {:.2f}, Min: {}, Max:{}'.format(\ 93 | datetime.now(), update, num_updates, (update+1)*(self.args.num_workers * self.args.nsteps),\ 94 | final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max())) 95 | torch.save(self.net.state_dict(), self.model_path + 'model.pt') 96 | 97 | # update_network 98 | def _update_network(self, obs, returns, actions): 99 | # evaluate the actions 100 | input_tensor = self._get_tensors(obs) 101 | values, pi = self.net(input_tensor) 102 | # define the tensor of actions, returns 103 | returns = torch.tensor(returns, dtype=torch.float32).unsqueeze(1) 104 | actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(1) 105 | if self.args.cuda: 106 | returns = returns.cuda() 107 | actions = actions.cuda() 108 | # evaluate actions 109 | action_log_probs, dist_entropy = evaluate_actions(pi, actions) 110 | 
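# the update below is the standard A2C objective: with advantage A = R - V(s),
# the actor minimizes -A * log pi(a|s) (A is detached so the policy gradient
# does not flow through the critic), the critic minimizes A^2, and an entropy
# bonus weighted by entropy_coef encourages exploration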
# calculate advantages... 111 | advantages = returns - values 112 | # get the value loss 113 | value_loss = advantages.pow(2).mean() 114 | # get the action loss 115 | action_loss = -(advantages.detach() * action_log_probs).mean() 116 | # total loss 117 | total_loss = action_loss + self.args.value_loss_coef * value_loss - self.args.entropy_coef * dist_entropy 118 | # start to update 119 | self.optimizer.zero_grad() 120 | total_loss.backward() 121 | torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.args.max_grad_norm) 122 | self.optimizer.step() 123 | return value_loss.item(), action_loss.item(), dist_entropy.item() 124 | 125 | # get the tensors... 126 | def _get_tensors(self, obs): 127 | input_tensor = torch.tensor(np.transpose(obs, (0, 3, 1, 2)), dtype=torch.float32) 128 | if self.args.cuda: 129 | input_tensor = input_tensor.cuda() 130 | return input_tensor 131 | -------------------------------------------------------------------------------- /rl_utils/env_wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | import contextlib 4 | 5 | class AlreadySteppingError(Exception): 6 | """ 7 | Raised when an asynchronous step is running while 8 | step_async() is called again. 9 | """ 10 | 11 | def __init__(self): 12 | msg = 'already running an async step' 13 | Exception.__init__(self, msg) 14 | 15 | 16 | class NotSteppingError(Exception): 17 | """ 18 | Raised when an asynchronous step is not running but 19 | step_wait() is called. 20 | """ 21 | 22 | def __init__(self): 23 | msg = 'not running an async step' 24 | Exception.__init__(self, msg) 25 | 26 | 27 | class VecEnv(ABC): 28 | """ 29 | An abstract asynchronous, vectorized environment. 30 | Used to batch data from multiple copies of an environment, so that 31 | each observation becomes an batch of observations, and expected action is a batch of actions to 32 | be applied per-environment. 33 | """ 34 | closed = False 35 | viewer = None 36 | 37 | metadata = { 38 | 'render.modes': ['human', 'rgb_array'] 39 | } 40 | 41 | def __init__(self, num_envs, observation_space, action_space): 42 | self.num_envs = num_envs 43 | self.observation_space = observation_space 44 | self.action_space = action_space 45 | 46 | @abstractmethod 47 | def reset(self): 48 | """ 49 | Reset all the environments and return an array of 50 | observations, or a dict of observation arrays. 51 | 52 | If step_async is still doing work, that work will 53 | be cancelled and step_wait() should not be called 54 | until step_async() is invoked again. 55 | """ 56 | pass 57 | 58 | @abstractmethod 59 | def step_async(self, actions): 60 | """ 61 | Tell all the environments to start taking a step 62 | with the given actions. 63 | Call step_wait() to get the results of the step. 64 | 65 | You should not call this if a step_async run is 66 | already pending. 67 | """ 68 | pass 69 | 70 | @abstractmethod 71 | def step_wait(self): 72 | """ 73 | Wait for the step taken with step_async(). 74 | 75 | Returns (obs, rews, dones, infos): 76 | - obs: an array of observations, or a dict of 77 | arrays of observations. 78 | - rews: an array of rewards 79 | - dones: an array of "episode done" booleans 80 | - infos: a sequence of info objects 81 | """ 82 | pass 83 | 84 | def close_extras(self): 85 | """ 86 | Clean up the extra resources, beyond what's in this base class. 87 | Only runs when not self.closed. 
88 | """ 89 | pass 90 | 91 | def close(self): 92 | if self.closed: 93 | return 94 | if self.viewer is not None: 95 | self.viewer.close() 96 | self.close_extras() 97 | self.closed = True 98 | 99 | def step(self, actions): 100 | """ 101 | Step the environments synchronously. 102 | 103 | This is available for backwards compatibility. 104 | """ 105 | self.step_async(actions) 106 | return self.step_wait() 107 | 108 | def render(self, mode='human'): 109 | raise NotImplementedError 110 | 111 | def get_images(self): 112 | """ 113 | Return RGB images from each environment 114 | """ 115 | raise NotImplementedError 116 | 117 | @property 118 | def unwrapped(self): 119 | if isinstance(self, VecEnvWrapper): 120 | return self.venv.unwrapped 121 | else: 122 | return self 123 | 124 | def get_viewer(self): 125 | if self.viewer is None: 126 | from gym.envs.classic_control import rendering 127 | self.viewer = rendering.SimpleImageViewer() 128 | return self.viewer 129 | 130 | class VecEnvWrapper(VecEnv): 131 | """ 132 | An environment wrapper that applies to an entire batch 133 | of environments at once. 134 | """ 135 | 136 | def __init__(self, venv, observation_space=None, action_space=None): 137 | self.venv = venv 138 | super().__init__(num_envs=venv.num_envs, 139 | observation_space=observation_space or venv.observation_space, 140 | action_space=action_space or venv.action_space) 141 | 142 | def step_async(self, actions): 143 | self.venv.step_async(actions) 144 | 145 | @abstractmethod 146 | def reset(self): 147 | pass 148 | 149 | @abstractmethod 150 | def step_wait(self): 151 | pass 152 | 153 | def close(self): 154 | return self.venv.close() 155 | 156 | def render(self, mode='human'): 157 | return self.venv.render(mode=mode) 158 | 159 | def get_images(self): 160 | return self.venv.get_images() 161 | 162 | def __getattr__(self, name): 163 | if name.startswith('_'): 164 | raise AttributeError("attempted to get missing private attribute '{}'".format(name)) 165 | return getattr(self.venv, name) 166 | 167 | class VecEnvObservationWrapper(VecEnvWrapper): 168 | @abstractmethod 169 | def process(self, obs): 170 | pass 171 | 172 | def reset(self): 173 | obs = self.venv.reset() 174 | return self.process(obs) 175 | 176 | def step_wait(self): 177 | obs, rews, dones, infos = self.venv.step_wait() 178 | return self.process(obs), rews, dones, infos 179 | 180 | class CloudpickleWrapper(object): 181 | """ 182 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 183 | """ 184 | 185 | def __init__(self, x): 186 | self.x = x 187 | 188 | def __getstate__(self): 189 | import cloudpickle 190 | return cloudpickle.dumps(self.x) 191 | 192 | def __setstate__(self, ob): 193 | import pickle 194 | self.x = pickle.loads(ob) 195 | 196 | @contextlib.contextmanager 197 | def clear_mpi_env_vars(): 198 | """ 199 | from mpi4py import MPI will call MPI_Init by default. If the child process has MPI environment variables, MPI will think that the child process is an MPI process just like the parent and do bad things such as hang. 200 | This context manager is a hacky way to clear those environment variables temporarily such as when we are starting multiprocessing 201 | Processes. 
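    A minimal usage sketch (this is how SubprocVecEnv in multi_envs_wrapper.py
    uses it): wrap only the child-process start call,

        with clear_mpi_env_vars():
            proc.start()

    so the OMPI_*/PMI_* variables are hidden just while the workers are spawned.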
202 | """ 203 | removed_environment = {} 204 | for k, v in list(os.environ.items()): 205 | for prefix in ['OMPI_', 'PMI_']: 206 | if k.startswith(prefix): 207 | removed_environment[k] = v 208 | del os.environ[k] 209 | try: 210 | yield 211 | finally: 212 | os.environ.update(removed_environment) 213 | -------------------------------------------------------------------------------- /rl_algorithms/ddpg/ddpg_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from models import actor, critic 3 | import torch 4 | import os 5 | from datetime import datetime 6 | from mpi4py import MPI 7 | from rl_utils.mpi_utils.normalizer import normalizer 8 | from rl_utils.mpi_utils.utils import sync_networks, sync_grads 9 | from rl_utils.experience_replay.experience_replay import replay_buffer 10 | from utils import ounoise 11 | import copy 12 | import gym 13 | 14 | """ 15 | ddpg algorithms - revised baseline version 16 | 17 | support MPI training 18 | 19 | """ 20 | 21 | class ddpg_agent: 22 | def __init__(self, env, args): 23 | self.env = env 24 | self.args = args 25 | # get the dims and action max of the environment 26 | obs_dims = self.env.observation_space.shape[0] 27 | self.action_dims = self.env.action_space.shape[0] 28 | self.action_max = self.env.action_space.high[0] 29 | # define the network 30 | self.actor_net = actor(obs_dims, self.action_dims) 31 | self.critic_net = critic(obs_dims, self.action_dims) 32 | # sync the weights across the mpi 33 | sync_networks(self.actor_net) 34 | sync_networks(self.critic_net) 35 | # build the target newtork 36 | self.actor_target_net = copy.deepcopy(self.actor_net) 37 | self.critic_target_net = copy.deepcopy(self.critic_net) 38 | # create the optimizer 39 | self.actor_optim = torch.optim.Adam(self.actor_net.parameters(), self.args.lr_actor) 40 | self.critic_optim = torch.optim.Adam(self.critic_net.parameters(), self.args.lr_critic, weight_decay=self.args.critic_l2_reg) 41 | # create the replay buffer 42 | self.replay_buffer = replay_buffer(self.args.replay_size) 43 | # create the normalizer 44 | self.o_norm = normalizer(obs_dims, default_clip_range=self.args.clip_range) 45 | # create the noise generator 46 | self.noise_generator = ounoise(std=0.2, action_dim=self.action_dims) 47 | # create the dir to save models 48 | if MPI.COMM_WORLD.Get_rank() == 0: 49 | if not os.path.exists(self.args.save_dir): 50 | os.mkdir(self.args.save_dir) 51 | self.model_path = os.path.join(self.args.save_dir, self.args.env_name) 52 | if not os.path.exists(self.model_path): 53 | os.mkdir(self.model_path) 54 | # create a eval environemnt 55 | self.eval_env = gym.make(self.args.env_name) 56 | # set seeds 57 | self.eval_env.seed(self.args.seed * 2 + MPI.COMM_WORLD.Get_rank()) 58 | 59 | def learn(self): 60 | """ 61 | the learning part 62 | 63 | """ 64 | self.actor_net.train() 65 | # reset the environmenr firstly 66 | obs = self.env.reset() 67 | self.noise_generator.reset() 68 | # get the number of epochs 69 | nb_epochs = self.args.total_frames // (self.args.nb_rollout_steps * self.args.nb_cycles) 70 | for epoch in range(nb_epochs): 71 | for _ in range(self.args.nb_cycles): 72 | # used to update the normalizer 73 | ep_obs = [] 74 | for _ in range(self.args.nb_rollout_steps): 75 | with torch.no_grad(): 76 | inputs_tensor = self._preproc_inputs(obs) 77 | pi = self.actor_net(inputs_tensor) 78 | action = self._select_actions(pi) 79 | # feed actions into the environment 80 | obs_, reward, done, _ = self.env.step(self.action_max * 
action) 81 | # append the rollout information into the memory 82 | self.replay_buffer.add(obs, action, reward, obs_, float(done)) 83 | ep_obs.append(obs.copy()) 84 | obs = obs_ 85 | # if done, reset the environment 86 | if done: 87 | obs = self.env.reset() 88 | self.noise_generator.reset() 89 | # then start to do the update of the normalizer 90 | ep_obs = np.array(ep_obs) 91 | self.o_norm.update(ep_obs) 92 | self.o_norm.recompute_stats() 93 | # then start to update the network 94 | for _ in range(self.args.nb_train): 95 | a_loss, c_loss = self._update_network() 96 | # update the target network 97 | self._soft_update_target_network(self.actor_target_net, self.actor_net) 98 | self._soft_update_target_network(self.critic_target_net, self.critic_net) 99 | # start to do the evaluation 100 | success_rate = self._eval_agent() 101 | # convert back to normal 102 | self.actor_net.train() 103 | if epoch % self.args.display_interval == 0: 104 | if MPI.COMM_WORLD.Get_rank() == 0: 105 | print('[{}] Epoch: {} / {}, Frames: {}, Rewards: {:.3f}, Actor loss: {:.3f}, Critic Loss: {:.3f}'.format(datetime.now(), \ 106 | epoch, nb_epochs, (epoch+1) * self.args.nb_rollout_steps * self.args.nb_cycles, success_rate, a_loss, c_loss)) 107 | torch.save([self.actor_net.state_dict(), self.o_norm.mean, self.o_norm.std], self.model_path + '/model.pt') 108 | 109 | # functions to preprocess the image 110 | def _preproc_inputs(self, obs): 111 | obs_norm = self.o_norm.normalize(obs) 112 | inputs_tensor = torch.tensor(obs_norm, dtype=torch.float32).unsqueeze(0) 113 | return inputs_tensor 114 | 115 | # this function will choose action for the agent and do the exploration 116 | def _select_actions(self, pi): 117 | action = pi.cpu().numpy().squeeze() 118 | # TODO: Noise type now - only support ounoise 119 | # add the gaussian noise 120 | #action = action + np.random.normal(0, 0.1, self.action_dims) 121 | # add ou noise 122 | action = action + self.noise_generator.noise() 123 | action = np.clip(action, -1, 1) 124 | return action 125 | 126 | # update the network 127 | def _update_network(self): 128 | # sample the samples from the replay buffer 129 | samples = self.replay_buffer.sample(self.args.batch_size) 130 | obses, actions, rewards, obses_next, dones = samples 131 | # try to do the normalization of obses 132 | norm_obses = self.o_norm.normalize(obses) 133 | norm_obses_next = self.o_norm.normalize(obses_next) 134 | # transfer them into tensors 135 | norm_obses_tensor = torch.tensor(norm_obses, dtype=torch.float32) 136 | norm_obses_next_tensor = torch.tensor(norm_obses_next, dtype=torch.float32) 137 | actions_tensor = torch.tensor(actions, dtype=torch.float32) 138 | rewards_tensor = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1) 139 | dones_tensor = torch.tensor(dones, dtype=torch.float32).unsqueeze(1) 140 | with torch.no_grad(): 141 | actions_next = self.actor_target_net(norm_obses_next_tensor) 142 | q_next_value = self.critic_target_net(norm_obses_next_tensor, actions_next) 143 | target_q_value = rewards_tensor + (1 - dones_tensor) * self.args.gamma * q_next_value 144 | # the real q value 145 | real_q_value = self.critic_net(norm_obses_tensor, actions_tensor) 146 | critic_loss = (real_q_value - target_q_value).pow(2).mean() 147 | # the actor loss 148 | actions_real = self.actor_net(norm_obses_tensor) 149 | actor_loss = -self.critic_net(norm_obses_tensor, actions_real).mean() 150 | # start to update the network 151 | self.actor_optim.zero_grad() 152 | actor_loss.backward() 153 | sync_grads(self.actor_net) 154 | 
self.actor_optim.step() 155 | # update the critic network 156 | self.critic_optim.zero_grad() 157 | critic_loss.backward() 158 | sync_grads(self.critic_net) 159 | self.critic_optim.step() 160 | return actor_loss.item(), critic_loss.item() 161 | 162 | # soft update the target network... 163 | def _soft_update_target_network(self, target, source): 164 | for target_param, param in zip(target.parameters(), source.parameters()): 165 | target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data) 166 | 167 | # do the evaluation 168 | def _eval_agent(self): 169 | self.actor_net.eval() 170 | total_success_rate = [] 171 | for _ in range(self.args.nb_test_rollouts): 172 | per_success_rate = [] 173 | obs = self.eval_env.reset() 174 | while True: 175 | with torch.no_grad(): 176 | inputs_tensor = self._preproc_inputs(obs) 177 | pi = self.actor_net(inputs_tensor) 178 | actions = pi.detach().cpu().numpy().squeeze() 179 | if self.action_dims == 1: 180 | actions = np.array([actions]) 181 | obs_, reward, done, _ = self.eval_env.step(actions * self.action_max) 182 | per_success_rate.append(reward) 183 | obs = obs_ 184 | if done: 185 | break 186 | total_success_rate.append(np.sum(per_success_rate)) 187 | local_success_rate = np.mean(total_success_rate) 188 | global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate, op=MPI.SUM) 189 | return global_success_rate / MPI.COMM_WORLD.Get_size() 190 | -------------------------------------------------------------------------------- /rl_algorithms/trpo/trpo_agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | from models import network 5 | from rl_utils.running_filter.running_filter import ZFilter 6 | from utils import select_actions, eval_actions, conjugated_gradient, line_search, set_flat_params_to 7 | from datetime import datetime 8 | 9 | class trpo_agent: 10 | def __init__(self, env, args): 11 | self.env = env 12 | self.args = args 13 | # define the network 14 | self.net = network(self.env.observation_space.shape[0], self.env.action_space.shape[0]) 15 | self.old_net = network(self.env.observation_space.shape[0], self.env.action_space.shape[0]) 16 | # make sure the net and old net have the same parameters 17 | self.old_net.load_state_dict(self.net.state_dict()) 18 | # define the optimizer 19 | self.optimizer = torch.optim.Adam(self.net.critic.parameters(), lr=self.args.lr) 20 | # define the running mean filter 21 | self.running_state = ZFilter((self.env.observation_space.shape[0],), clip=5) 22 | if not os.path.exists(self.args.save_dir): 23 | os.mkdir(self.args.save_dir) 24 | self.model_path = self.args.save_dir + self.args.env_name + '/' 25 | if not os.path.exists(self.model_path): 26 | os.mkdir(self.model_path) 27 | 28 | def learn(self): 29 | num_updates = self.args.total_timesteps // self.args.nsteps 30 | obs = self.running_state(self.env.reset()) 31 | final_reward = 0 32 | episode_reward = 0 33 | self.dones = False 34 | for update in range(num_updates): 35 | mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], [] 36 | for step in range(self.args.nsteps): 37 | with torch.no_grad(): 38 | obs_tensor = self._get_tensors(obs) 39 | value, pi = self.net(obs_tensor) 40 | # select actions 41 | actions = select_actions(pi) 42 | # store informations 43 | mb_obs.append(np.copy(obs)) 44 | mb_actions.append(actions) 45 | mb_dones.append(self.dones) 46 | mb_values.append(value.detach().numpy().squeeze()) 47 | # start to 
execute actions in the environment 48 | obs_, reward, done, _ = self.env.step(actions) 49 | self.dones = done 50 | mb_rewards.append(reward) 51 | if done: 52 | obs_ = self.env.reset() 53 | obs = self.running_state(obs_) 54 | episode_reward += reward 55 | mask = 0.0 if done else 1.0 56 | final_reward *= mask 57 | final_reward += (1 - mask) * episode_reward 58 | episode_reward *= mask 59 | # to process the rollouts 60 | mb_obs = np.asarray(mb_obs, dtype=np.float32) 61 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32) 62 | mb_actions = np.asarray(mb_actions, dtype=np.float32) 63 | mb_dones = np.asarray(mb_dones, dtype=np.bool) 64 | mb_values = np.asarray(mb_values, dtype=np.float32) 65 | # compute the last state value 66 | with torch.no_grad(): 67 | obs_tensor = self._get_tensors(obs) 68 | last_value, _ = self.net(obs_tensor) 69 | last_value = last_value.detach().numpy().squeeze() 70 | # compute the advantages 71 | mb_returns = np.zeros_like(mb_rewards) 72 | mb_advs = np.zeros_like(mb_rewards) 73 | lastgaelam = 0 74 | for t in reversed(range(self.args.nsteps)): 75 | if t == self.args.nsteps - 1: 76 | nextnonterminal = 1.0 - self.dones 77 | nextvalues = last_value 78 | else: 79 | nextnonterminal = 1.0 - mb_dones[t + 1] 80 | nextvalues = mb_values[t + 1] 81 | delta = mb_rewards[t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[t] 82 | mb_advs[t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam 83 | mb_returns = mb_advs + mb_values 84 | # normalize the advantages 85 | mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-5) 86 | # before the update, make the old network has the parameter of the current network 87 | self.old_net.load_state_dict(self.net.state_dict()) 88 | # start to update the network 89 | policy_loss, value_loss = self._update_network(mb_obs, mb_actions, mb_returns, mb_advs) 90 | torch.save([self.net.state_dict(), self.running_state], self.model_path + 'model.pt') 91 | print('[{}] Update: {} / {}, Frames: {}, Reward: {:.3f}, VL: {:.3f}, PL: {:.3f}'.format(datetime.now(), update, \ 92 | num_updates, (update + 1)*self.args.nsteps, final_reward, value_loss, policy_loss)) 93 | 94 | # start to update network 95 | def _update_network(self, mb_obs, mb_actions, mb_returns, mb_advs): 96 | mb_obs_tensor = torch.tensor(mb_obs, dtype=torch.float32) 97 | mb_actions_tensor = torch.tensor(mb_actions, dtype=torch.float32) 98 | mb_returns_tensor = torch.tensor(mb_returns, dtype=torch.float32).unsqueeze(1) 99 | mb_advs_tensor = torch.tensor(mb_advs, dtype=torch.float32).unsqueeze(1) 100 | # try to get the old policy and current policy 101 | values, _ = self.net(mb_obs_tensor) 102 | with torch.no_grad(): 103 | _, pi_old = self.old_net(mb_obs_tensor) 104 | # get the surr loss 105 | surr_loss = self._get_surrogate_loss(mb_obs_tensor, mb_advs_tensor, mb_actions_tensor, pi_old) 106 | # comupte the surrogate gardient -> g, Ax = g, where A is the fisher information matrix 107 | surr_grad = torch.autograd.grad(surr_loss, self.net.actor.parameters()) 108 | flat_surr_grad = torch.cat([grad.view(-1) for grad in surr_grad]).data 109 | # use the conjugated gradient to calculate the scaled direction vector (natural gradient) 110 | nature_grad = conjugated_gradient(self._fisher_vector_product, -flat_surr_grad, 10, mb_obs_tensor, pi_old) 111 | # calculate the scaleing ratio 112 | non_scale_kl = 0.5 * (nature_grad * self._fisher_vector_product(nature_grad, mb_obs_tensor, pi_old)).sum(0, keepdim=True) 113 | scale_ratio = torch.sqrt(non_scale_kl / 
self.args.max_kl) 114 | final_nature_grad = nature_grad / scale_ratio[0] 115 | # calculate the expected improvement rate... 116 | expected_improve = (-flat_surr_grad * nature_grad).sum(0, keepdim=True) / scale_ratio[0] 117 | # get the flat param ... 118 | prev_params = torch.cat([param.data.view(-1) for param in self.net.actor.parameters()]) 119 | # start to do the line search 120 | success, new_params = line_search(self.net.actor, self._get_surrogate_loss, prev_params, final_nature_grad, \ 121 | expected_improve, mb_obs_tensor, mb_advs_tensor, mb_actions_tensor, pi_old) 122 | set_flat_params_to(self.net.actor, new_params) 123 | # then trying to update the critic network 124 | inds = np.arange(mb_obs.shape[0]) 125 | for _ in range(self.args.vf_itrs): 126 | np.random.shuffle(inds) 127 | for start in range(0, mb_obs.shape[0], self.args.batch_size): 128 | end = start + self.args.batch_size 129 | mbinds = inds[start:end] 130 | mini_obs = mb_obs[mbinds] 131 | mini_returns = mb_returns[mbinds] 132 | # put things in the tensor 133 | mini_obs = torch.tensor(mini_obs, dtype=torch.float32) 134 | mini_returns = torch.tensor(mini_returns, dtype=torch.float32).unsqueeze(1) 135 | values, _ = self.net(mini_obs) 136 | v_loss = (mini_returns - values).pow(2).mean() 137 | self.optimizer.zero_grad() 138 | v_loss.backward() 139 | self.optimizer.step() 140 | return surr_loss.item(), v_loss.item() 141 | 142 | # get the surrogate loss 143 | def _get_surrogate_loss(self, obs, adv, actions, pi_old): 144 | _, pi = self.net(obs) 145 | log_prob = eval_actions(pi, actions) 146 | old_log_prob = eval_actions(pi_old, actions).detach() 147 | surr_loss = -torch.exp(log_prob - old_log_prob) * adv 148 | return surr_loss.mean() 149 | 150 | # the product of the fisher informaiton matrix and the nature gradient -> Ax 151 | def _fisher_vector_product(self, v, obs, pi_old): 152 | kl = self._get_kl(obs, pi_old) 153 | kl = kl.mean() 154 | # start to calculate the second order gradient of the KL 155 | kl_grads = torch.autograd.grad(kl, self.net.actor.parameters(), create_graph=True) 156 | flat_kl_grads = torch.cat([grad.view(-1) for grad in kl_grads]) 157 | kl_v = (flat_kl_grads * torch.autograd.Variable(v)).sum() 158 | kl_second_grads = torch.autograd.grad(kl_v, self.net.actor.parameters()) 159 | flat_kl_second_grads = torch.cat([grad.contiguous().view(-1) for grad in kl_second_grads]).data 160 | flat_kl_second_grads = flat_kl_second_grads + self.args.damping * v 161 | return flat_kl_second_grads 162 | 163 | # get the kl divergence between two distributions 164 | def _get_kl(self, obs, pi_old): 165 | mean_old, std_old = pi_old 166 | _, pi = self.net(obs) 167 | mean, std = pi 168 | # start to calculate the kl-divergence 169 | kl = -torch.log(std / std_old) + (std.pow(2) + (mean - mean_old).pow(2)) / (2 * std_old.pow(2)) - 0.5 170 | return kl.sum(1, keepdim=True) 171 | 172 | # get the tensors 173 | def _get_tensors(self, obs): 174 | return torch.tensor(obs, dtype=torch.float32).unsqueeze(0) 175 | -------------------------------------------------------------------------------- /rl_algorithms/sac/sac_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from models import flatten_mlp, tanh_gaussian_actor 4 | from rl_utils.experience_replay.experience_replay import replay_buffer 5 | from utils import get_action_info 6 | from datetime import datetime 7 | import copy 8 | import os 9 | import gym 10 | 11 | 12 | """ 13 | 2019-Nov-12 - start to add the 
automatically tempature tuning 14 | 15 | 2019-JUN-05 16 | 17 | author: Tianhong Dai 18 | 19 | """ 20 | 21 | # the soft-actor-critic agent 22 | class sac_agent: 23 | def __init__(self, env, args): 24 | self.args = args 25 | self.env = env 26 | # create eval environment 27 | self.eval_env = gym.make(self.args.env_name) 28 | self.eval_env.seed(args.seed * 2) 29 | # build up the network that will be used. 30 | self.qf1 = flatten_mlp(self.env.observation_space.shape[0], self.args.hidden_size, self.env.action_space.shape[0]) 31 | self.qf2 = flatten_mlp(self.env.observation_space.shape[0], self.args.hidden_size, self.env.action_space.shape[0]) 32 | # set the target q functions 33 | self.target_qf1 = copy.deepcopy(self.qf1) 34 | self.target_qf2 = copy.deepcopy(self.qf2) 35 | # build up the policy network 36 | self.actor_net = tanh_gaussian_actor(self.env.observation_space.shape[0], self.env.action_space.shape[0], self.args.hidden_size, \ 37 | self.args.log_std_min, self.args.log_std_max) 38 | # define the optimizer for them 39 | self.qf1_optim = torch.optim.Adam(self.qf1.parameters(), lr=self.args.q_lr) 40 | self.qf2_optim = torch.optim.Adam(self.qf2.parameters(), lr=self.args.q_lr) 41 | # the optimizer for the policy network 42 | self.actor_optim = torch.optim.Adam(self.actor_net.parameters(), lr=self.args.p_lr) 43 | # entorpy target 44 | self.target_entropy = -np.prod(self.env.action_space.shape).item() 45 | self.log_alpha = torch.zeros(1, requires_grad=True, device='cuda' if self.args.cuda else 'cpu') 46 | # define the optimizer 47 | self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.args.p_lr) 48 | # define the replay buffer 49 | self.buffer = replay_buffer(self.args.buffer_size) 50 | # get the action max 51 | self.action_max = self.env.action_space.high[0] 52 | # if use cuda, put tensor onto the gpu 53 | if self.args.cuda: 54 | self.actor_net.cuda() 55 | self.qf1.cuda() 56 | self.qf2.cuda() 57 | self.target_qf1.cuda() 58 | self.target_qf2.cuda() 59 | # automatically create the folders to save models 60 | if not os.path.exists(self.args.save_dir): 61 | os.mkdir(self.args.save_dir) 62 | self.model_path = os.path.join(self.args.save_dir, self.args.env_name) 63 | if not os.path.exists(self.model_path): 64 | os.mkdir(self.model_path) 65 | 66 | # train the agent 67 | def learn(self): 68 | global_timesteps = 0 69 | # before the official training, do the initial exploration to add episodes into the replay buffer 70 | self._initial_exploration(exploration_policy=self.args.init_exploration_policy) 71 | # reset the environment 72 | obs = self.env.reset() 73 | for epoch in range(self.args.n_epochs): 74 | for _ in range(self.args.train_loop_per_epoch): 75 | # for each epoch, it will reset the environment 76 | for t in range(self.args.epoch_length): 77 | # start to collect samples 78 | with torch.no_grad(): 79 | obs_tensor = self._get_tensor_inputs(obs) 80 | pi = self.actor_net(obs_tensor) 81 | action = get_action_info(pi, cuda=self.args.cuda).select_actions(reparameterize=False) 82 | action = action.cpu().numpy()[0] 83 | # input the actions into the environment 84 | obs_, reward, done, _ = self.env.step(self.action_max * action) 85 | # store the samples 86 | self.buffer.add(obs, action, reward, obs_, float(done)) 87 | # reassign the observations 88 | obs = obs_ 89 | if done: 90 | # reset the environment 91 | obs = self.env.reset() 92 | # after collect the samples, start to update the network 93 | for _ in range(self.args.update_cycles): 94 | qf1_loss, qf2_loss, actor_loss, alpha, alpha_loss = 
self._update_newtork() 95 | # update the target network 96 | if global_timesteps % self.args.target_update_interval == 0: 97 | self._update_target_network(self.target_qf1, self.qf1) 98 | self._update_target_network(self.target_qf2, self.qf2) 99 | global_timesteps += 1 100 | # print the log information 101 | if epoch % self.args.display_interval == 0: 102 | # start to do the evaluation 103 | mean_rewards = self._evaluate_agent() 104 | print('[{}] Epoch: {} / {}, Frames: {}, Rewards: {:.3f}, QF1: {:.3f}, QF2: {:.3f}, AL: {:.3f}, Alpha: {:.5f}, AlphaL: {:.5f}'.format(datetime.now(), \ 105 | epoch, self.args.n_epochs, (epoch + 1) * self.args.epoch_length, mean_rewards, qf1_loss, qf2_loss, actor_loss, alpha, alpha_loss)) 106 | # save models 107 | torch.save(self.actor_net.state_dict(), self.model_path + '/model.pt') 108 | 109 | # do the initial exploration by using the uniform policy 110 | def _initial_exploration(self, exploration_policy='gaussian'): 111 | # get the action information of the environment 112 | obs = self.env.reset() 113 | for _ in range(self.args.init_exploration_steps): 114 | if exploration_policy == 'uniform': 115 | raise NotImplementedError 116 | elif exploration_policy == 'gaussian': 117 | # the sac does not need normalize? 118 | with torch.no_grad(): 119 | obs_tensor = self._get_tensor_inputs(obs) 120 | # generate the policy 121 | pi = self.actor_net(obs_tensor) 122 | action = get_action_info(pi).select_actions(reparameterize=False) 123 | action = action.cpu().numpy()[0] 124 | # input the action input the environment 125 | obs_, reward, done, _ = self.env.step(self.action_max * action) 126 | # store the episodes 127 | self.buffer.add(obs, action, reward, obs_, float(done)) 128 | obs = obs_ 129 | if done: 130 | # if done, reset the environment 131 | obs = self.env.reset() 132 | print("Initial exploration has been finished!") 133 | # get tensors 134 | def _get_tensor_inputs(self, obs): 135 | obs_tensor = torch.tensor(obs, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu').unsqueeze(0) 136 | return obs_tensor 137 | 138 | # update the network 139 | def _update_newtork(self): 140 | # smaple batch of samples from the replay buffer 141 | obses, actions, rewards, obses_, dones = self.buffer.sample(self.args.batch_size) 142 | # preprocessing the data into the tensors, will support GPU later 143 | obses = torch.tensor(obses, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu') 144 | actions = torch.tensor(actions, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu') 145 | rewards = torch.tensor(rewards, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu').unsqueeze(-1) 146 | obses_ = torch.tensor(obses_, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu') 147 | inverse_dones = torch.tensor(1 - dones, dtype=torch.float32, device='cuda' if self.args.cuda else 'cpu').unsqueeze(-1) 148 | # start to update the actor network 149 | pis = self.actor_net(obses) 150 | actions_info = get_action_info(pis, cuda=self.args.cuda) 151 | actions_, pre_tanh_value = actions_info.select_actions(reparameterize=True) 152 | log_prob = actions_info.get_log_prob(actions_, pre_tanh_value) 153 | # use the automatically tuning 154 | alpha_loss = -(self.log_alpha * (log_prob + self.target_entropy).detach()).mean() 155 | self.alpha_optim.zero_grad() 156 | alpha_loss.backward() 157 | self.alpha_optim.step() 158 | # get the param 159 | alpha = self.log_alpha.exp() 160 | # get the q_value for new actions 161 | q_actions_ = torch.min(self.qf1(obses, 
actions_), self.qf2(obses, actions_)) 162 | actor_loss = (alpha * log_prob - q_actions_).mean() 163 | # q value function loss 164 | q1_value = self.qf1(obses, actions) 165 | q2_value = self.qf2(obses, actions) 166 | with torch.no_grad(): 167 | pis_next = self.actor_net(obses_) 168 | actions_info_next = get_action_info(pis_next, cuda=self.args.cuda) 169 | actions_next_, pre_tanh_value_next = actions_info_next.select_actions(reparameterize=True) 170 | log_prob_next = actions_info_next.get_log_prob(actions_next_, pre_tanh_value_next) 171 | target_q_value_next = torch.min(self.target_qf1(obses_, actions_next_), self.target_qf2(obses_, actions_next_)) - alpha * log_prob_next 172 | target_q_value = self.args.reward_scale * rewards + inverse_dones * self.args.gamma * target_q_value_next 173 | qf1_loss = (q1_value - target_q_value).pow(2).mean() 174 | qf2_loss = (q2_value - target_q_value).pow(2).mean() 175 | # qf1 176 | self.qf1_optim.zero_grad() 177 | qf1_loss.backward() 178 | self.qf1_optim.step() 179 | # qf2 180 | self.qf2_optim.zero_grad() 181 | qf2_loss.backward() 182 | self.qf2_optim.step() 183 | # policy loss 184 | self.actor_optim.zero_grad() 185 | actor_loss.backward() 186 | self.actor_optim.step() 187 | return qf1_loss.item(), qf2_loss.item(), actor_loss.item(), alpha.item(), alpha_loss.item() 188 | 189 | # update the target network 190 | def _update_target_network(self, target, source): 191 | for target_param, param in zip(target.parameters(), source.parameters()): 192 | target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data) 193 | 194 | # evaluate the agent 195 | def _evaluate_agent(self): 196 | total_reward = 0 197 | for _ in range(self.args.eval_episodes): 198 | obs = self.eval_env.reset() 199 | episode_reward = 0 200 | while True: 201 | with torch.no_grad(): 202 | obs_tensor = self._get_tensor_inputs(obs) 203 | pi = self.actor_net(obs_tensor) 204 | action = get_action_info(pi, cuda=self.args.cuda).select_actions(exploration=False, reparameterize=False) 205 | action = action.detach().cpu().numpy()[0] 206 | # input the action into the environment 207 | obs_, reward, done, _ = self.eval_env.step(self.action_max * action) 208 | episode_reward += reward 209 | if done: 210 | break 211 | obs = obs_ 212 | total_reward += episode_reward 213 | return total_reward / self.args.eval_episodes 214 | -------------------------------------------------------------------------------- /rl_utils/env_wrapper/atari_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | os.environ.setdefault('PATH', '') 4 | from collections import deque 5 | import gym 6 | from gym import spaces 7 | import cv2 8 | cv2.ocl.setUseOpenCL(False) 9 | 10 | """ 11 | the wrapper is taken from the openai baselines 12 | 13 | """ 14 | 15 | class NoopResetEnv(gym.Wrapper): 16 | def __init__(self, env, noop_max=30): 17 | """Sample initial states by taking random number of no-ops on reset. 18 | No-op is assumed to be action 0. 
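
        Example (illustrative only; any NoFrameskip Atari id works the same way):

            env = NoopResetEnv(gym.make('BreakoutNoFrameskip-v4'), noop_max=30)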
19 | """ 20 | gym.Wrapper.__init__(self, env) 21 | self.noop_max = noop_max 22 | self.override_num_noops = None 23 | self.noop_action = 0 24 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 25 | 26 | def reset(self, **kwargs): 27 | """ Do no-op action for a number of steps in [1, noop_max].""" 28 | self.env.reset(**kwargs) 29 | if self.override_num_noops is not None: 30 | noops = self.override_num_noops 31 | else: 32 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 33 | assert noops > 0 34 | obs = None 35 | for _ in range(noops): 36 | obs, _, done, _ = self.env.step(self.noop_action) 37 | if done: 38 | obs = self.env.reset(**kwargs) 39 | return obs 40 | 41 | def step(self, ac): 42 | return self.env.step(ac) 43 | 44 | class FireResetEnv(gym.Wrapper): 45 | def __init__(self, env): 46 | """Take action on reset for environments that are fixed until firing.""" 47 | gym.Wrapper.__init__(self, env) 48 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 49 | assert len(env.unwrapped.get_action_meanings()) >= 3 50 | 51 | def reset(self, **kwargs): 52 | self.env.reset(**kwargs) 53 | obs, _, done, _ = self.env.step(1) 54 | if done: 55 | self.env.reset(**kwargs) 56 | obs, _, done, _ = self.env.step(2) 57 | if done: 58 | self.env.reset(**kwargs) 59 | return obs 60 | 61 | def step(self, ac): 62 | return self.env.step(ac) 63 | 64 | class EpisodicLifeEnv(gym.Wrapper): 65 | def __init__(self, env): 66 | """Make end-of-life == end-of-episode, but only reset on true game over. 67 | Done by DeepMind for the DQN and co. since it helps value estimation. 68 | """ 69 | gym.Wrapper.__init__(self, env) 70 | self.lives = 0 71 | self.was_real_done = True 72 | 73 | def step(self, action): 74 | obs, reward, done, info = self.env.step(action) 75 | self.was_real_done = done 76 | # check current lives, make loss of life terminal, 77 | # then update lives to handle bonus lives 78 | lives = self.env.unwrapped.ale.lives() 79 | if lives < self.lives and lives > 0: 80 | # for Qbert sometimes we stay in lives == 0 condition for a few frames 81 | # so it's important to keep lives > 0, so that we only reset once 82 | # the environment advertises done. 83 | done = True 84 | self.lives = lives 85 | return obs, reward, done, info 86 | 87 | def reset(self, **kwargs): 88 | """Reset only when lives are exhausted. 89 | This way all states are still reachable even though lives are episodic, 90 | and the learner need not know about any of this behind-the-scenes. 
91 | """ 92 | if self.was_real_done: 93 | obs = self.env.reset(**kwargs) 94 | else: 95 | # no-op step to advance from terminal/lost life state 96 | obs, _, _, _ = self.env.step(0) 97 | self.lives = self.env.unwrapped.ale.lives() 98 | return obs 99 | 100 | class MaxAndSkipEnv(gym.Wrapper): 101 | def __init__(self, env, skip=4): 102 | """Return only every `skip`-th frame""" 103 | gym.Wrapper.__init__(self, env) 104 | # most recent raw observations (for max pooling across time steps) 105 | self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) 106 | self._skip = skip 107 | 108 | def step(self, action): 109 | """Repeat action, sum reward, and max over last observations.""" 110 | total_reward = 0.0 111 | done = None 112 | for i in range(self._skip): 113 | obs, reward, done, info = self.env.step(action) 114 | if i == self._skip - 2: self._obs_buffer[0] = obs 115 | if i == self._skip - 1: self._obs_buffer[1] = obs 116 | total_reward += reward 117 | if done: 118 | break 119 | # Note that the observation on the done=True frame 120 | # doesn't matter 121 | max_frame = self._obs_buffer.max(axis=0) 122 | 123 | return max_frame, total_reward, done, info 124 | 125 | def reset(self, **kwargs): 126 | return self.env.reset(**kwargs) 127 | 128 | class ClipRewardEnv(gym.RewardWrapper): 129 | def __init__(self, env): 130 | gym.RewardWrapper.__init__(self, env) 131 | 132 | def reward(self, reward): 133 | """Bin reward to {+1, 0, -1} by its sign.""" 134 | return np.sign(reward) 135 | 136 | 137 | class WarpFrame(gym.ObservationWrapper): 138 | def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None): 139 | """ 140 | Warp frames to 84x84 as done in the Nature paper and later work. 141 | 142 | If the environment uses dictionary observations, `dict_space_key` can be specified which indicates which 143 | observation should be warped. 144 | """ 145 | super().__init__(env) 146 | self._width = width 147 | self._height = height 148 | self._grayscale = grayscale 149 | self._key = dict_space_key 150 | if self._grayscale: 151 | num_colors = 1 152 | else: 153 | num_colors = 3 154 | 155 | new_space = gym.spaces.Box( 156 | low=0, 157 | high=255, 158 | shape=(self._height, self._width, num_colors), 159 | dtype=np.uint8, 160 | ) 161 | if self._key is None: 162 | original_space = self.observation_space 163 | self.observation_space = new_space 164 | else: 165 | original_space = self.observation_space.spaces[self._key] 166 | self.observation_space.spaces[self._key] = new_space 167 | assert original_space.dtype == np.uint8 and len(original_space.shape) == 3 168 | 169 | def observation(self, obs): 170 | if self._key is None: 171 | frame = obs 172 | else: 173 | frame = obs[self._key] 174 | 175 | if self._grayscale: 176 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 177 | frame = cv2.resize( 178 | frame, (self._width, self._height), interpolation=cv2.INTER_AREA 179 | ) 180 | if self._grayscale: 181 | frame = np.expand_dims(frame, -1) 182 | 183 | if self._key is None: 184 | obs = frame 185 | else: 186 | obs = obs.copy() 187 | obs[self._key] = frame 188 | return obs 189 | 190 | 191 | class FrameStack(gym.Wrapper): 192 | def __init__(self, env, k): 193 | """Stack k last frames. 194 | 195 | Returns lazy array, which is much more memory efficient. 
196 | 197 | See Also 198 | -------- 199 | baselines.common.atari_wrappers.LazyFrames 200 | """ 201 | gym.Wrapper.__init__(self, env) 202 | self.k = k 203 | self.frames = deque([], maxlen=k) 204 | shp = env.observation_space.shape 205 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype) 206 | 207 | def reset(self): 208 | ob = self.env.reset() 209 | for _ in range(self.k): 210 | self.frames.append(ob) 211 | return self._get_ob() 212 | 213 | def step(self, action): 214 | ob, reward, done, info = self.env.step(action) 215 | self.frames.append(ob) 216 | return self._get_ob(), reward, done, info 217 | 218 | def _get_ob(self): 219 | assert len(self.frames) == self.k 220 | return LazyFrames(list(self.frames)) 221 | 222 | class ScaledFloatFrame(gym.ObservationWrapper): 223 | def __init__(self, env): 224 | gym.ObservationWrapper.__init__(self, env) 225 | self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) 226 | 227 | def observation(self, observation): 228 | # careful! This undoes the memory optimization, use 229 | # with smaller replay buffers only. 230 | return np.array(observation).astype(np.float32) / 255.0 231 | 232 | class LazyFrames(object): 233 | def __init__(self, frames): 234 | """This object ensures that common frames between the observations are only stored once. 235 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 236 | buffers. 237 | 238 | This object should only be converted to numpy array before being passed to the model. 239 | 240 | You'd not believe how complex the previous solution was.""" 241 | self._frames = frames 242 | self._out = None 243 | 244 | def _force(self): 245 | if self._out is None: 246 | self._out = np.concatenate(self._frames, axis=-1) 247 | self._frames = None 248 | return self._out 249 | 250 | def __array__(self, dtype=None): 251 | out = self._force() 252 | if dtype is not None: 253 | out = out.astype(dtype) 254 | return out 255 | 256 | def __len__(self): 257 | return len(self._force()) 258 | 259 | def __getitem__(self, i): 260 | return self._force()[i] 261 | 262 | def count(self): 263 | frames = self._force() 264 | return frames.shape[frames.ndim - 1] 265 | 266 | def frame(self, i): 267 | return self._force()[..., i] 268 | 269 | def make_atari(env_id, max_episode_steps=None): 270 | env = gym.make(env_id) 271 | assert 'NoFrameskip' in env.spec.id 272 | env = NoopResetEnv(env, noop_max=30) 273 | env = MaxAndSkipEnv(env, skip=4) 274 | if max_episode_steps is not None: 275 | env = TimeLimit(env, max_episode_steps=max_episode_steps) 276 | return env 277 | 278 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): 279 | """Configure environment for DeepMind-style Atari. 
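
    Example (illustrative; pairs with make_atari defined above):

        env = wrap_deepmind(make_atari('BreakoutNoFrameskip-v4'), frame_stack=True)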
280 | """ 281 | if episode_life: 282 | env = EpisodicLifeEnv(env) 283 | if 'FIRE' in env.unwrapped.get_action_meanings(): 284 | env = FireResetEnv(env) 285 | env = WarpFrame(env) 286 | if scale: 287 | env = ScaledFloatFrame(env) 288 | if clip_rewards: 289 | env = ClipRewardEnv(env) 290 | if frame_stack: 291 | env = FrameStack(env, 4) 292 | return env 293 | 294 | # time limit 295 | class TimeLimit(gym.Wrapper): 296 | def __init__(self, env, max_episode_steps=None): 297 | super(TimeLimit, self).__init__(env) 298 | self._max_episode_steps = max_episode_steps 299 | self._elapsed_steps = 0 300 | 301 | def step(self, ac): 302 | observation, reward, done, info = self.env.step(ac) 303 | self._elapsed_steps += 1 304 | if self._elapsed_steps >= self._max_episode_steps: 305 | done = True 306 | info['TimeLimit.truncated'] = True 307 | return observation, reward, done, info 308 | 309 | def reset(self, **kwargs): 310 | self._elapsed_steps = 0 311 | return self.env.reset(**kwargs) 312 | -------------------------------------------------------------------------------- /rl_algorithms/ppo/ppo_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import optim 4 | from rl_utils.running_filter.running_filter import ZFilter 5 | from models import cnn_net, mlp_net 6 | from utils import select_actions, evaluate_actions 7 | from datetime import datetime 8 | import os 9 | import copy 10 | 11 | class ppo_agent: 12 | def __init__(self, envs, args): 13 | self.envs = envs 14 | self.args = args 15 | # start to build the network. 16 | if self.args.env_type == 'atari': 17 | self.net = cnn_net(envs.action_space.n) 18 | elif self.args.env_type == 'mujoco': 19 | self.net = mlp_net(envs.observation_space.shape[0], envs.action_space.shape[0], self.args.dist) 20 | self.old_net = copy.deepcopy(self.net) 21 | # if use the cuda... 22 | if self.args.cuda: 23 | self.net.cuda() 24 | self.old_net.cuda() 25 | # define the optimizer... 26 | self.optimizer = optim.Adam(self.net.parameters(), self.args.lr, eps=self.args.eps) 27 | # running filter... 28 | if self.args.env_type == 'mujoco': 29 | num_states = self.envs.observation_space.shape[0] 30 | self.running_state = ZFilter((num_states, ), clip=5) 31 | # check saving folder.. 32 | if not os.path.exists(self.args.save_dir): 33 | os.mkdir(self.args.save_dir) 34 | # env folder.. 35 | self.model_path = os.path.join(self.args.save_dir, self.args.env_name) 36 | if not os.path.exists(self.model_path): 37 | os.mkdir(self.model_path) 38 | # get the observation 39 | self.batch_ob_shape = (self.args.num_workers * self.args.nsteps, ) + self.envs.observation_space.shape 40 | self.obs = np.zeros((self.args.num_workers, ) + self.envs.observation_space.shape, dtype=self.envs.observation_space.dtype.name) 41 | if self.args.env_type == 'mujoco': 42 | self.obs[:] = np.expand_dims(self.running_state(self.envs.reset()), 0) 43 | else: 44 | self.obs[:] = self.envs.reset() 45 | self.dones = [False for _ in range(self.args.num_workers)] 46 | 47 | # start to train the network... 
48 | def learn(self): 49 | num_updates = self.args.total_frames // (self.args.nsteps * self.args.num_workers) 50 | # get the reward to calculate other informations 51 | episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32) 52 | final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32) 53 | for update in range(num_updates): 54 | mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], [] 55 | if self.args.lr_decay: 56 | self._adjust_learning_rate(update, num_updates) 57 | for step in range(self.args.nsteps): 58 | with torch.no_grad(): 59 | # get tensors 60 | obs_tensor = self._get_tensors(self.obs) 61 | values, pis = self.net(obs_tensor) 62 | # select actions 63 | actions = select_actions(pis, self.args.dist, self.args.env_type) 64 | if self.args.env_type == 'atari': 65 | input_actions = actions 66 | else: 67 | if self.args.dist == 'gauss': 68 | input_actions = actions.copy() 69 | elif self.args.dist == 'beta': 70 | input_actions = -1 + 2 * actions 71 | # start to store information 72 | mb_obs.append(np.copy(self.obs)) 73 | mb_actions.append(actions) 74 | mb_dones.append(self.dones) 75 | mb_values.append(values.detach().cpu().numpy().squeeze()) 76 | # start to excute the actions in the environment 77 | obs, rewards, dones, _ = self.envs.step(input_actions) 78 | # update dones 79 | if self.args.env_type == 'mujoco': 80 | dones = np.array([dones]) 81 | rewards = np.array([rewards]) 82 | self.dones = dones 83 | mb_rewards.append(rewards) 84 | # clear the observation 85 | for n, done in enumerate(dones): 86 | if done: 87 | self.obs[n] = self.obs[n] * 0 88 | if self.args.env_type == 'mujoco': 89 | # reset the environment 90 | obs = self.envs.reset() 91 | self.obs = obs if self.args.env_type == 'atari' else np.expand_dims(self.running_state(obs), 0) 92 | # process the rewards part -- display the rewards on the screen 93 | episode_rewards += rewards 94 | masks = np.array([0.0 if done_ else 1.0 for done_ in dones], dtype=np.float32) 95 | final_rewards *= masks 96 | final_rewards += (1 - masks) * episode_rewards 97 | episode_rewards *= masks 98 | # process the rollouts 99 | mb_obs = np.asarray(mb_obs, dtype=np.float32) 100 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32) 101 | mb_actions = np.asarray(mb_actions, dtype=np.float32) 102 | mb_dones = np.asarray(mb_dones, dtype=np.bool) 103 | mb_values = np.asarray(mb_values, dtype=np.float32) 104 | if self.args.env_type == 'mujoco': 105 | mb_values = np.expand_dims(mb_values, 1) 106 | # compute the last state value 107 | with torch.no_grad(): 108 | obs_tensor = self._get_tensors(self.obs) 109 | last_values, _ = self.net(obs_tensor) 110 | last_values = last_values.detach().cpu().numpy().squeeze() 111 | # start to compute advantages... 
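            # GAE recursion implemented below (noted here as a sketch of the math):
            #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
            #   A_t     = delta_t + gamma * tau * (1 - done_{t+1}) * A_{t+1}
            # and the critic targets are R_t = A_t + V(s_t).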
112 | mb_returns = np.zeros_like(mb_rewards) 113 | mb_advs = np.zeros_like(mb_rewards) 114 | lastgaelam = 0 115 | for t in reversed(range(self.args.nsteps)): 116 | if t == self.args.nsteps - 1: 117 | nextnonterminal = 1.0 - self.dones 118 | nextvalues = last_values 119 | else: 120 | nextnonterminal = 1.0 - mb_dones[t + 1] 121 | nextvalues = mb_values[t + 1] 122 | delta = mb_rewards[t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[t] 123 | mb_advs[t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam 124 | mb_returns = mb_advs + mb_values 125 | # after compute the returns, let's process the rollouts 126 | mb_obs = mb_obs.swapaxes(0, 1).reshape(self.batch_ob_shape) 127 | if self.args.env_type == 'atari': 128 | mb_actions = mb_actions.swapaxes(0, 1).flatten() 129 | mb_returns = mb_returns.swapaxes(0, 1).flatten() 130 | mb_advs = mb_advs.swapaxes(0, 1).flatten() 131 | # before update the network, the old network will try to load the weights 132 | self.old_net.load_state_dict(self.net.state_dict()) 133 | # start to update the network 134 | pl, vl, ent = self._update_network(mb_obs, mb_actions, mb_returns, mb_advs) 135 | # display the training information 136 | if update % self.args.display_interval == 0: 137 | print('[{}] Update: {} / {}, Frames: {}, Rewards: {:.3f}, Min: {:.3f}, Max: {:.3f}, PL: {:.3f},'\ 138 | 'VL: {:.3f}, Ent: {:.3f}'.format(datetime.now(), update, num_updates, (update + 1)*self.args.nsteps*self.args.num_workers, \ 139 | final_rewards.mean(), final_rewards.min(), final_rewards.max(), pl, vl, ent)) 140 | # save the model 141 | if self.args.env_type == 'atari': 142 | torch.save(self.net.state_dict(), self.model_path + '/model.pt') 143 | else: 144 | # for the mujoco, we also need to keep the running mean filter! 145 | torch.save([self.net.state_dict(), self.running_state], self.model_path + '/model.pt') 146 | 147 | # update the network 148 | def _update_network(self, obs, actions, returns, advantages): 149 | inds = np.arange(obs.shape[0]) 150 | nbatch_train = obs.shape[0] // self.args.batch_size 151 | for _ in range(self.args.epoch): 152 | np.random.shuffle(inds) 153 | for start in range(0, obs.shape[0], nbatch_train): 154 | # get the mini-batchs 155 | end = start + nbatch_train 156 | mbinds = inds[start:end] 157 | mb_obs = obs[mbinds] 158 | mb_actions = actions[mbinds] 159 | mb_returns = returns[mbinds] 160 | mb_advs = advantages[mbinds] 161 | # convert minibatches to tensor 162 | mb_obs = self._get_tensors(mb_obs) 163 | mb_actions = torch.tensor(mb_actions, dtype=torch.float32) 164 | mb_returns = torch.tensor(mb_returns, dtype=torch.float32).unsqueeze(1) 165 | mb_advs = torch.tensor(mb_advs, dtype=torch.float32).unsqueeze(1) 166 | # normalize adv 167 | mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-8) 168 | if self.args.cuda: 169 | mb_actions = mb_actions.cuda() 170 | mb_returns = mb_returns.cuda() 171 | mb_advs = mb_advs.cuda() 172 | # start to get values 173 | mb_values, pis = self.net(mb_obs) 174 | # start to calculate the value loss... 
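                # PPO clipped surrogate computed below (noted for clarity):
                #   ratio_t  = exp(log_prob - old_log_prob) = pi(a_t|s_t) / pi_old(a_t|s_t)
                #   L_policy = -E[ min(ratio_t * A_t, clip(ratio_t, 1 - clip, 1 + clip) * A_t) ]
                # with the clip range taken from self.args.clip.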
175 | value_loss = (mb_returns - mb_values).pow(2).mean() 176 | # start to calculate the policy loss 177 | with torch.no_grad(): 178 | _, old_pis = self.old_net(mb_obs) 179 | # get the old log probs 180 | old_log_prob, _ = evaluate_actions(old_pis, mb_actions, self.args.dist, self.args.env_type) 181 | old_log_prob = old_log_prob.detach() 182 | # evaluate the current policy 183 | log_prob, ent_loss = evaluate_actions(pis, mb_actions, self.args.dist, self.args.env_type) 184 | prob_ratio = torch.exp(log_prob - old_log_prob) 185 | # surr1 186 | surr1 = prob_ratio * mb_advs 187 | surr2 = torch.clamp(prob_ratio, 1 - self.args.clip, 1 + self.args.clip) * mb_advs 188 | policy_loss = -torch.min(surr1, surr2).mean() 189 | # final total loss 190 | total_loss = policy_loss + self.args.vloss_coef * value_loss - ent_loss * self.args.ent_coef 191 | # clear the grad buffer 192 | self.optimizer.zero_grad() 193 | total_loss.backward() 194 | torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.args.max_grad_norm) 195 | # update 196 | self.optimizer.step() 197 | return policy_loss.item(), value_loss.item(), ent_loss.item() 198 | 199 | # convert the numpy array to tensors 200 | def _get_tensors(self, obs): 201 | if self.args.env_type == 'atari': 202 | obs_tensor = torch.tensor(np.transpose(obs, (0, 3, 1, 2)), dtype=torch.float32) 203 | else: 204 | obs_tensor = torch.tensor(obs, dtype=torch.float32) 205 | # decide if put the tensor on the GPU 206 | if self.args.cuda: 207 | obs_tensor = obs_tensor.cuda() 208 | return obs_tensor 209 | 210 | # adjust the learning rate 211 | def _adjust_learning_rate(self, update, num_updates): 212 | lr_frac = 1 - (update / num_updates) 213 | adjust_lr = self.args.lr * lr_frac 214 | for param_group in self.optimizer.param_groups: 215 | param_group['lr'] = adjust_lr 216 | -------------------------------------------------------------------------------- /rl_utils/logger/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | import os.path as osp 5 | import json 6 | import time 7 | import datetime 8 | import tempfile 9 | from collections import defaultdict 10 | from contextlib import contextmanager 11 | 12 | DEBUG = 10 13 | INFO = 20 14 | WARN = 30 15 | ERROR = 40 16 | 17 | DISABLED = 50 18 | 19 | class KVWriter(object): 20 | def writekvs(self, kvs): 21 | raise NotImplementedError 22 | 23 | class SeqWriter(object): 24 | def writeseq(self, seq): 25 | raise NotImplementedError 26 | 27 | class HumanOutputFormat(KVWriter, SeqWriter): 28 | def __init__(self, filename_or_file): 29 | if isinstance(filename_or_file, str): 30 | self.file = open(filename_or_file, 'wt') 31 | self.own_file = True 32 | else: 33 | assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s'%filename_or_file 34 | self.file = filename_or_file 35 | self.own_file = False 36 | 37 | def writekvs(self, kvs): 38 | # Create strings for printing 39 | key2str = {} 40 | for (key, val) in sorted(kvs.items()): 41 | if hasattr(val, '__float__'): 42 | valstr = '%-8.3g' % val 43 | else: 44 | valstr = str(val) 45 | key2str[self._truncate(key)] = self._truncate(valstr) 46 | 47 | # Find max widths 48 | if len(key2str) == 0: 49 | print('WARNING: tried to write empty key-value dict') 50 | return 51 | else: 52 | keywidth = max(map(len, key2str.keys())) 53 | valwidth = max(map(len, key2str.values())) 54 | 55 | # Write out the data 56 | dashes = '-' * (keywidth + valwidth + 7) 57 | lines = [dashes] 58 | for (key, val) in 
sorted(key2str.items(), key=lambda kv: kv[0].lower()): 59 | lines.append('| %s%s | %s%s |' % ( 60 | key, 61 | ' ' * (keywidth - len(key)), 62 | val, 63 | ' ' * (valwidth - len(val)), 64 | )) 65 | lines.append(dashes) 66 | self.file.write('\n'.join(lines) + '\n') 67 | 68 | # Flush the output to the file 69 | self.file.flush() 70 | 71 | def _truncate(self, s): 72 | maxlen = 30 73 | return s[:maxlen-3] + '...' if len(s) > maxlen else s 74 | 75 | def writeseq(self, seq): 76 | seq = list(seq) 77 | for (i, elem) in enumerate(seq): 78 | self.file.write(elem) 79 | if i < len(seq) - 1: # add space unless this is the last one 80 | self.file.write(' ') 81 | self.file.write('\n') 82 | self.file.flush() 83 | 84 | def close(self): 85 | if self.own_file: 86 | self.file.close() 87 | 88 | class JSONOutputFormat(KVWriter): 89 | def __init__(self, filename): 90 | self.file = open(filename, 'wt') 91 | 92 | def writekvs(self, kvs): 93 | for k, v in sorted(kvs.items()): 94 | if hasattr(v, 'dtype'): 95 | kvs[k] = float(v) 96 | self.file.write(json.dumps(kvs) + '\n') 97 | self.file.flush() 98 | 99 | def close(self): 100 | self.file.close() 101 | 102 | class CSVOutputFormat(KVWriter): 103 | def __init__(self, filename): 104 | self.file = open(filename, 'w+t') 105 | self.keys = [] 106 | self.sep = ',' 107 | 108 | def writekvs(self, kvs): 109 | # Add our current row to the history 110 | extra_keys = list(kvs.keys() - self.keys) 111 | extra_keys.sort() 112 | if extra_keys: 113 | self.keys.extend(extra_keys) 114 | self.file.seek(0) 115 | lines = self.file.readlines() 116 | self.file.seek(0) 117 | for (i, k) in enumerate(self.keys): 118 | if i > 0: 119 | self.file.write(',') 120 | self.file.write(k) 121 | self.file.write('\n') 122 | for line in lines[1:]: 123 | self.file.write(line[:-1]) 124 | self.file.write(self.sep * len(extra_keys)) 125 | self.file.write('\n') 126 | for (i, k) in enumerate(self.keys): 127 | if i > 0: 128 | self.file.write(',') 129 | v = kvs.get(k) 130 | if v is not None: 131 | self.file.write(str(v)) 132 | self.file.write('\n') 133 | self.file.flush() 134 | 135 | def close(self): 136 | self.file.close() 137 | 138 | 139 | class TensorBoardOutputFormat(KVWriter): 140 | """ 141 | Dumps key/value pairs into TensorBoard's numeric format. 142 | """ 143 | def __init__(self, dir): 144 | os.makedirs(dir, exist_ok=True) 145 | self.dir = dir 146 | self.step = 1 147 | prefix = 'events' 148 | path = osp.join(osp.abspath(dir), prefix) 149 | import tensorflow as tf 150 | from tensorflow.python import pywrap_tensorflow 151 | from tensorflow.core.util import event_pb2 152 | from tensorflow.python.util import compat 153 | self.tf = tf 154 | self.event_pb2 = event_pb2 155 | self.pywrap_tensorflow = pywrap_tensorflow 156 | self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) 157 | 158 | def writekvs(self, kvs): 159 | def summary_val(k, v): 160 | kwargs = {'tag': k, 'simple_value': float(v)} 161 | return self.tf.Summary.Value(**kwargs) 162 | summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) 163 | event = self.event_pb2.Event(wall_time=time.time(), summary=summary) 164 | event.step = self.step # is there any reason why you'd want to specify the step? 
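        # (a likely answer to the question above: the explicit step gives TensorBoard a
        #  monotonically increasing x-axis across successive dumpkvs() calls)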
165 | self.writer.WriteEvent(event) 166 | self.writer.Flush() 167 | self.step += 1 168 | 169 | def close(self): 170 | if self.writer: 171 | self.writer.Close() 172 | self.writer = None 173 | 174 | def make_output_format(format, ev_dir, log_suffix=''): 175 | os.makedirs(ev_dir, exist_ok=True) 176 | if format == 'stdout': 177 | return HumanOutputFormat(sys.stdout) 178 | elif format == 'log': 179 | return HumanOutputFormat(osp.join(ev_dir, 'log%s.txt' % log_suffix)) 180 | elif format == 'json': 181 | return JSONOutputFormat(osp.join(ev_dir, 'progress%s.json' % log_suffix)) 182 | elif format == 'csv': 183 | return CSVOutputFormat(osp.join(ev_dir, 'progress%s.csv' % log_suffix)) 184 | elif format == 'tensorboard': 185 | return TensorBoardOutputFormat(osp.join(ev_dir, 'tb%s' % log_suffix)) 186 | else: 187 | raise ValueError('Unknown format specified: %s' % (format,)) 188 | 189 | # ================================================================ 190 | # API 191 | # ================================================================ 192 | 193 | def logkv(key, val): 194 | """ 195 | Log a value of some diagnostic 196 | Call this once for each diagnostic quantity, each iteration 197 | If called many times, last value will be used. 198 | """ 199 | get_current().logkv(key, val) 200 | 201 | def logkv_mean(key, val): 202 | """ 203 | The same as logkv(), but if called many times, values averaged. 204 | """ 205 | get_current().logkv_mean(key, val) 206 | 207 | def logkvs(d): 208 | """ 209 | Log a dictionary of key-value pairs 210 | """ 211 | for (k, v) in d.items(): 212 | logkv(k, v) 213 | 214 | def dumpkvs(): 215 | """ 216 | Write all of the diagnostics from the current iteration 217 | """ 218 | return get_current().dumpkvs() 219 | 220 | def getkvs(): 221 | return get_current().name2val 222 | 223 | 224 | def log(*args, level=INFO): 225 | """ 226 | Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). 227 | """ 228 | get_current().log(*args, level=level) 229 | 230 | def debug(*args): 231 | log(*args, level=DEBUG) 232 | 233 | def info(*args): 234 | log(*args, level=INFO) 235 | 236 | def warn(*args): 237 | log(*args, level=WARN) 238 | 239 | def error(*args): 240 | log(*args, level=ERROR) 241 | 242 | 243 | def set_level(level): 244 | """ 245 | Set logging threshold on current logger. 246 | """ 247 | get_current().set_level(level) 248 | 249 | def set_comm(comm): 250 | get_current().set_comm(comm) 251 | 252 | def get_dir(): 253 | """ 254 | Get directory that log files are being written to. 
255 | will be None if there is no output directory (i.e., if you didn't call start) 256 | """ 257 | return get_current().get_dir() 258 | 259 | record_tabular = logkv 260 | dump_tabular = dumpkvs 261 | 262 | @contextmanager 263 | def profile_kv(scopename): 264 | logkey = 'wait_' + scopename 265 | tstart = time.time() 266 | try: 267 | yield 268 | finally: 269 | get_current().name2val[logkey] += time.time() - tstart 270 | 271 | def profile(n): 272 | """ 273 | Usage: 274 | @profile("my_func") 275 | def my_func(): code 276 | """ 277 | def decorator_with_name(func): 278 | def func_wrapper(*args, **kwargs): 279 | with profile_kv(n): 280 | return func(*args, **kwargs) 281 | return func_wrapper 282 | return decorator_with_name 283 | 284 | 285 | # ================================================================ 286 | # Backend 287 | # ================================================================ 288 | 289 | def get_current(): 290 | if Logger.CURRENT is None: 291 | _configure_default_logger() 292 | 293 | return Logger.CURRENT 294 | 295 | 296 | class Logger(object): 297 | DEFAULT = None # A logger with no output files. (See right below class definition) 298 | # So that you can still log to the terminal without setting up any output files 299 | CURRENT = None # Current logger being used by the free functions above 300 | 301 | def __init__(self, dir, output_formats, comm=None): 302 | self.name2val = defaultdict(float) # values this iteration 303 | self.name2cnt = defaultdict(int) 304 | self.level = INFO 305 | self.dir = dir 306 | self.output_formats = output_formats 307 | self.comm = comm 308 | 309 | # Logging API, forwarded 310 | # ---------------------------------------- 311 | def logkv(self, key, val): 312 | self.name2val[key] = val 313 | 314 | def logkv_mean(self, key, val): 315 | oldval, cnt = self.name2val[key], self.name2cnt[key] 316 | self.name2val[key] = oldval*cnt/(cnt+1) + val/(cnt+1) 317 | self.name2cnt[key] = cnt + 1 318 | 319 | def dumpkvs(self): 320 | if self.comm is None: 321 | d = self.name2val 322 | else: 323 | from baselines.common import mpi_util 324 | d = mpi_util.mpi_weighted_mean(self.comm, 325 | {name : (val, self.name2cnt.get(name, 1)) 326 | for (name, val) in self.name2val.items()}) 327 | if self.comm.rank != 0: 328 | d['dummy'] = 1 # so we don't get a warning about empty dict 329 | out = d.copy() # Return the dict for unit testing purposes 330 | for fmt in self.output_formats: 331 | if isinstance(fmt, KVWriter): 332 | fmt.writekvs(d) 333 | self.name2val.clear() 334 | self.name2cnt.clear() 335 | return out 336 | 337 | def log(self, *args, level=INFO): 338 | if self.level <= level: 339 | self._do_log(args) 340 | 341 | # Configuration 342 | # ---------------------------------------- 343 | def set_level(self, level): 344 | self.level = level 345 | 346 | def set_comm(self, comm): 347 | self.comm = comm 348 | 349 | def get_dir(self): 350 | return self.dir 351 | 352 | def close(self): 353 | for fmt in self.output_formats: 354 | fmt.close() 355 | 356 | # Misc 357 | # ---------------------------------------- 358 | def _do_log(self, args): 359 | for fmt in self.output_formats: 360 | if isinstance(fmt, SeqWriter): 361 | fmt.writeseq(map(str, args)) 362 | 363 | def get_rank_without_mpi_import(): 364 | # check environment variables here instead of importing mpi4py 365 | # to avoid calling MPI_Init() when this module is imported 366 | for varname in ['PMI_RANK', 'OMPI_COMM_WORLD_RANK']: 367 | if varname in os.environ: 368 | return int(os.environ[varname]) 369 | return 0 370 | 371 | 372 | 
def configure(dir=None, format_strs=None, comm=None, log_suffix=''): 373 | """ 374 | If comm is provided, average all numerical stats across that comm 375 | """ 376 | if dir is None: 377 | dir = os.getenv('OPENAI_LOGDIR') 378 | if dir is None: 379 | dir = osp.join(tempfile.gettempdir(), 380 | datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) 381 | assert isinstance(dir, str) 382 | dir = os.path.expanduser(dir) 383 | os.makedirs(os.path.expanduser(dir), exist_ok=True) 384 | 385 | rank = get_rank_without_mpi_import() 386 | if rank > 0: 387 | log_suffix = log_suffix + "-rank%03i" % rank 388 | 389 | if format_strs is None: 390 | if rank == 0: 391 | format_strs = os.getenv('OPENAI_LOG_FORMAT', 'stdout,log,csv').split(',') 392 | else: 393 | format_strs = os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',') 394 | format_strs = filter(None, format_strs) 395 | output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs] 396 | 397 | Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm) 398 | if output_formats: 399 | log('Logging to %s'%dir) 400 | 401 | def _configure_default_logger(): 402 | configure() 403 | Logger.DEFAULT = Logger.CURRENT 404 | 405 | def reset(): 406 | if Logger.CURRENT is not Logger.DEFAULT: 407 | Logger.CURRENT.close() 408 | Logger.CURRENT = Logger.DEFAULT 409 | log('Reset logger') 410 | 411 | @contextmanager 412 | def scoped_configure(dir=None, format_strs=None, comm=None): 413 | prevlogger = Logger.CURRENT 414 | configure(dir=dir, format_strs=format_strs, comm=comm) 415 | try: 416 | yield 417 | finally: 418 | Logger.CURRENT.close() 419 | Logger.CURRENT = prevlogger 420 | 421 | # ================================================================ 422 | 423 | def _demo(): 424 | info("hi") 425 | debug("shouldn't appear") 426 | set_level(DEBUG) 427 | debug("should appear") 428 | dir = "/tmp/testlogging" 429 | if os.path.exists(dir): 430 | shutil.rmtree(dir) 431 | configure(dir=dir) 432 | logkv("a", 3) 433 | logkv("b", 2.5) 434 | dumpkvs() 435 | logkv("b", -2.5) 436 | logkv("a", 5.5) 437 | dumpkvs() 438 | info("^^^ should see a = 5.5") 439 | logkv_mean("b", -22.5) 440 | logkv_mean("b", -44.4) 441 | logkv("a", 5.5) 442 | dumpkvs() 443 | info("^^^ should see b = -33.3") 444 | 445 | logkv("b", -2.5) 446 | dumpkvs() 447 | 448 | logkv("a", "longasslongasslongasslongasslongasslongassvalue") 449 | dumpkvs() 450 | 451 | 452 | # ================================================================ 453 | # Readers 454 | # ================================================================ 455 | 456 | def read_json(fname): 457 | import pandas 458 | ds = [] 459 | with open(fname, 'rt') as fh: 460 | for line in fh: 461 | ds.append(json.loads(line)) 462 | return pandas.DataFrame(ds) 463 | 464 | def read_csv(fname): 465 | import pandas 466 | return pandas.read_csv(fname, index_col=None, comment='#') 467 | 468 | def read_tb(path): 469 | """ 470 | path : a tensorboard file OR a directory, where we will find all TB files 471 | of the form events.* 472 | """ 473 | import pandas 474 | import numpy as np 475 | from glob import glob 476 | import tensorflow as tf 477 | if osp.isdir(path): 478 | fnames = glob(osp.join(path, "events.*")) 479 | elif osp.basename(path).startswith("events."): 480 | fnames = [path] 481 | else: 482 | raise NotImplementedError("Expected tensorboard file or directory containing them. 
Got %s"%path) 483 | tag2pairs = defaultdict(list) 484 | maxstep = 0 485 | for fname in fnames: 486 | for summary in tf.train.summary_iterator(fname): 487 | if summary.step > 0: 488 | for v in summary.summary.value: 489 | pair = (summary.step, v.simple_value) 490 | tag2pairs[v.tag].append(pair) 491 | maxstep = max(summary.step, maxstep) 492 | data = np.empty((maxstep, len(tag2pairs))) 493 | data[:] = np.nan 494 | tags = sorted(tag2pairs.keys()) 495 | for (colidx,tag) in enumerate(tags): 496 | pairs = tag2pairs[tag] 497 | for (step, value) in pairs: 498 | data[step-1, colidx] = value 499 | return pandas.DataFrame(data, columns=tags) 500 | 501 | if __name__ == "__main__": 502 | _demo() 503 | --------------------------------------------------------------------------------