├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── assets ├── TensorBoard.gif ├── continuous.png ├── different_gait.gif ├── offline.png ├── online_il.png └── rs-mpc.gif ├── docker ├── Dockerfile └── run_docker.bash ├── rlil ├── __init__.py ├── agents │ ├── __init__.py │ ├── airl.py │ ├── base.py │ ├── bc.py │ ├── bcq.py │ ├── bear.py │ ├── brac.py │ ├── ddpg.py │ ├── gail.py │ ├── noisy_td3.py │ ├── ppo.py │ ├── rs_mpc.py │ ├── sac.py │ ├── td3.py │ ├── vac.py │ └── vae_bc.py ├── approximation │ ├── __init__.py │ ├── approximation.py │ ├── bcq_auto_encoder.py │ ├── checkpointer │ │ └── __init__.py │ ├── discriminator.py │ ├── dynamics.py │ ├── ensemble_q_continuous.py │ ├── feature_network.py │ ├── q_continuous.py │ ├── q_network.py │ ├── target │ │ ├── __init__.py │ │ ├── abstract.py │ │ ├── fixed.py │ │ ├── polyak.py │ │ └── trivial.py │ └── v_network.py ├── environments │ ├── __init__.py │ ├── action.py │ ├── base.py │ ├── data │ │ └── ant_half_front_legs.xml │ ├── gym.py │ ├── reward_fns.py │ ├── rlil_envs.py │ └── state.py ├── experiments │ ├── __init__.py │ ├── experiment.py │ └── trainer.py ├── initializer.py ├── memory │ ├── __init__.py │ ├── airl_wrapper.py │ ├── base.py │ ├── gae_wrapper.py │ ├── gail_wrapper.py │ ├── replay_buffer.py │ └── sqil_wrapper.py ├── nn │ └── __init__.py ├── policies │ ├── __init__.py │ ├── bcq_deterministic.py │ ├── deterministic.py │ ├── gaussian.py │ ├── soft_deterministic.py │ └── softmax.py ├── presets │ ├── __init__.py │ ├── continuous │ │ ├── __init__.py │ │ ├── airl.py │ │ ├── bc.py │ │ ├── bcq.py │ │ ├── bear.py │ │ ├── brac.py │ │ ├── ddpg.py │ │ ├── gail.py │ │ ├── models.py │ │ ├── noisy_td3.py │ │ ├── ppo.py │ │ ├── rs_mpc.py │ │ ├── sac.py │ │ ├── sqil.py │ │ ├── td3.py │ │ ├── vac.py │ │ └── vae_bc.py │ └── validate_agent.py ├── samplers │ ├── __init__.py │ ├── asyncsampler.py │ └── base.py └── utils │ ├── __init__.py │ ├── plots.py │ └── writer.py ├── runs └── .gitignore ├── scripts ├── __init__.py ├── continuous │ ├── offline.py │ ├── online.py │ ├── online_il.py │ └── watch_continuous.py ├── offline_continuous.bash ├── online_continuous.bash ├── online_il_continuous.bash ├── plot.py └── record_trajectory.py ├── setup.py └── tests ├── __init__.py ├── agents └── __init__.py ├── approximation ├── __init__.py ├── bcq_encoder_test.py ├── dynamics_test.py ├── ensemble_q_continuous_test.py ├── feature_network_test.py ├── q_network_test.py └── v_network_test.py ├── benchmark ├── __init__.py ├── action_test.py ├── cpu_gpu_test.py ├── state_test.py └── train_test.py ├── conftest.py ├── environments ├── __init__.py ├── action_test.py ├── gym_test.py └── state_test.py ├── experiments ├── __init__.py ├── experiment_test.py └── trainer_test.py ├── memory ├── __init__.py ├── airl_wrapper_test.py ├── gae_wrapper_test.py ├── gail_wrapper_test.py ├── replay_buffer_test.py └── sqil_wrapper_test.py ├── mock_agent.py ├── nn ├── __init__.py └── nn_test.py ├── policies ├── __init__.py ├── bcq_deterministic_test.py ├── deterministic_test.py ├── gaussian_test.py ├── soft_deterministic_test.py └── softmax_test.py ├── presets ├── __init__.py ├── offline_continuous_test.py ├── online_continuous_test.py └── online_il_continuous_test.py ├── samplers ├── __init__.py └── asyncsampler_test.py └── utils └── writer ├── __init__.py └── writer_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # python 2 | *.pyc 3 | __pycache__ 4 | 5 | # build directories 6 | /build 7 | /dist 8 | heavy_runs 9 | scripts/runs/ 10 
| 11 | # editor 12 | .vscode 13 | .idea 14 | 15 | # non-committed code 16 | local 17 | legacy 18 | /out 19 | /others 20 | 21 | # Byte-compiled / optimized / DLL files 22 | __pycache__/ 23 | *.py[cod] 24 | *$py.class 25 | 26 | # C extensions 27 | *.so 28 | 29 | # Distribution / packaging 30 | .Python 31 | build/ 32 | develop-eggs/ 33 | dist/ 34 | downloads/ 35 | eggs/ 36 | .eggs/ 37 | lib/ 38 | lib64/ 39 | parts/ 40 | sdist/ 41 | var/ 42 | wheels/ 43 | pip-wheel-metadata/ 44 | share/python-wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | MANIFEST 49 | 50 | # PyInstaller 51 | # Usually these files are written by a python script from a template 52 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 53 | *.manifest 54 | *.spec 55 | 56 | # Installer logs 57 | pip-log.txt 58 | pip-delete-this-directory.txt 59 | 60 | # Unit test / coverage reports 61 | htmlcov/ 62 | .tox/ 63 | .nox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *.cover 70 | *.py,cover 71 | .hypothesis/ 72 | .pytest_cache/ 73 | cover/ 74 | 75 | # Translations 76 | *.mo 77 | *.pot 78 | 79 | # Django stuff: 80 | *.log 81 | local_settings.py 82 | db.sqlite3 83 | db.sqlite3-journal 84 | 85 | # Flask stuff: 86 | instance/ 87 | .webassets-cache 88 | 89 | # Scrapy stuff: 90 | .scrapy 91 | 92 | # Sphinx documentation 93 | docs/_build/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Toshinori Kitamura 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install: 2 | conda install torch torchvision 3 | pip install tensorboard 4 | pip install -e . 5 | 6 | test: 7 | pytest -v --benchmark-skip 8 | 9 | benchmark: 10 | pytest -v --benchmark-only 11 | 12 | autopep8: 13 | autopep8 --in-place --recursive . 
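# NOTE: the `deploy` target further below depends on a `lint` target that is not
# defined in this Makefile; a minimal sketch of one (assuming flake8 is
# installed) could be:
# lint:
#     flake8 rlil tests scripts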
14 | 15 | tensorboard: 16 | tensorboard --logdir runs 17 | 18 | clean: 19 | rm -rf dist 20 | rm -rf build 21 | 22 | build: clean 23 | python setup.py sdist bdist_wheel 24 | 25 | deploy: lint test build 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /assets/TensorBoard.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/TensorBoard.gif -------------------------------------------------------------------------------- /assets/continuous.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/continuous.png -------------------------------------------------------------------------------- /assets/different_gait.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/different_gait.gif -------------------------------------------------------------------------------- /assets/offline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/offline.png -------------------------------------------------------------------------------- /assets/online_il.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/online_il.png -------------------------------------------------------------------------------- /assets/rs-mpc.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/rs-mpc.gif -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # This Dockerfile is based on https://github.com/ikeyasu/docker-reinforcement-learning 2 | 3 | # To use cuda9.2 container, you need to install nvidia-driver >= 396.26 4 | # See https://github.com/NVIDIA/nvidia-docker/wiki/CUDA#requirements 5 | FROM syuntoku/rl_ws:latest 6 | MAINTAINER syuntoku14 7 | 8 | RUN git clone git@github.com:syuntoku14/pytorch-rl-il.git 9 | RUN cd pytorch-rl-il && pip install -e . 
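# NOTE: the clone above uses the SSH URL (git@github.com:...), which fails during a
# plain `docker build` unless an SSH key is made available to the build;
# cloning the public HTTPS URL https://github.com/syuntoku14/pytorch-rl-il.git
# instead avoids that requirement.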
10 | 11 | CMD ["bash"] 12 | WORKDIR /root -------------------------------------------------------------------------------- /docker/run_docker.bash: -------------------------------------------------------------------------------- 1 | # umask 0002 is to change the permission to a normal user 2 | 3 | run_docker() { 4 | docker run --rm -it \ 5 | -p 6080:6080 \ 6 | -p 8888:8888 \ 7 | -p 6006:6006 \ 8 | -p 5678:5678 \ 9 | -p 8265:8265 \ 10 | -v ~/RL_ws:/root/RL_ws \ 11 | -v ~/pytorch-rl-il:/root/pytorch-rl-il \ 12 | -e DISPLAY=:0 \ 13 | --name rl \ 14 | --shm-size 256G \ 15 | --entrypoint "" \ 16 | syuntoku/rl_ws:rlil bash -c "umask 0002 && bash" 17 | } 18 | 19 | run_docker_gpu() { 20 | docker run --rm -it \ 21 | -p 6080:6080 \ 22 | -p 8888:8888 \ 23 | -p 6006:6006 \ 24 | -p 5678:5678 \ 25 | -p 8265:8265 \ 26 | -v ~/RL_ws:/root/RL_ws \ 27 | -v ~/pytorch-rl-il:/root/pytorch-rl-il \ 28 | -e DISPLAY=:0 \ 29 | --name rl \ 30 | --shm-size 256G \ 31 | -e NVIDIA_VISIBLE_DEVICES=all \ 32 | -e NVIDIA_DRIVER_CAPABILITIES=all \ 33 | --gpus=all \ 34 | --entrypoint "" \ 35 | syuntoku/rl_ws:rlil bash -c "umask 0002 && bash" 36 | } 37 | 38 | getopts "n" OPT 39 | case $OPT in 40 | n ) echo "--runtime=nvidia" 41 | run_docker_gpu ;; 42 | ? ) echo "Without gpu" 43 | run_docker ;; 44 | esac 45 | -------------------------------------------------------------------------------- /rlil/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/rlil/__init__.py -------------------------------------------------------------------------------- /rlil/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Agent, LazyAgent 2 | from .vac import VAC 3 | from .ddpg import DDPG 4 | from .sac import SAC 5 | from .td3 import TD3 6 | from .noisy_td3 import NoisyTD3 7 | from .bcq import BCQ 8 | from .bear import BEAR 9 | from .brac import BRAC 10 | from .bc import BC 11 | from .vae_bc import VaeBC 12 | from .ppo import PPO 13 | from .gail import GAIL 14 | from .airl import AIRL 15 | from .rs_mpc import RsMPC 16 | 17 | __all__ = [ 18 | "Agent", 19 | "LazyAgent", 20 | "VAC", 21 | "DDPG", 22 | "SAC", 23 | "TD3", 24 | "NoisyTD3", 25 | "BCQ", 26 | "BEAR", 27 | "BRAC", 28 | "BC", 29 | "VaeBC", 30 | "PPO", 31 | "GAIL", 32 | "AIRL", 33 | "RsMPC" 34 | ] 35 | -------------------------------------------------------------------------------- /rlil/agents/airl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.initializer import get_device, get_writer, get_replay_buffer 3 | from rlil import nn 4 | from .gail import GAIL 5 | 6 | 7 | class AIRL(GAIL): 8 | """ 9 | Adversarial inverse reinforcement learning (AIRL) 10 | 11 | AIRL is an inverse reinforcement learning algorithm based on 12 | adversarial learning. AIRL trains not only the reward function 13 | but also the value function to make the reward function robust 14 | to changes in dynamics. 15 | 16 | Args: 17 | base_agent (rlil.agent.Agent): 18 | An off-policy agent such as ddpg, td3, sac 19 | minibatch_size (int): 20 | The number of experiences to sample in each discriminator update. 21 | replay_start_size (int): Number of experiences in replay buffer when training begins. 
22 | update_frequency (int): Number of base_agent update per discriminator update 23 | """ 24 | 25 | def __init__(self, 26 | base_agent, 27 | minibatch_size=32, 28 | replay_start_size=5000, 29 | update_frequency=10, 30 | ): 31 | # objects 32 | self.base_agent = base_agent 33 | self.replay_buffer = get_replay_buffer() 34 | self.reward_fn = self.replay_buffer.reward_fn 35 | self.value_fn = self.replay_buffer.value_fn 36 | self.writer = get_writer() 37 | self.device = get_device() 38 | self.discrim_criterion = nn.BCELoss() 39 | # hyperparameters 40 | self.minibatch_size = minibatch_size 41 | self.replay_start_size = replay_start_size 42 | self.update_frequency = update_frequency 43 | self._train_count = 0 44 | 45 | def train(self): 46 | # train discriminator 47 | if self.should_train(): 48 | samples, expert_samples = self.replay_buffer.sample_both( 49 | self.minibatch_size) 50 | states, actions, _, next_states, _, _ = samples 51 | exp_states, exp_actions, _, exp_next_states, _, _ = expert_samples 52 | 53 | fake = self.replay_buffer.discrim(states, actions, next_states) 54 | real = self.replay_buffer.discrim(exp_states, 55 | exp_actions, 56 | exp_next_states) 57 | discrim_loss = self.discrim_criterion(fake, torch.ones_like(fake)) + \ 58 | self.discrim_criterion(real, torch.zeros_like(real)) 59 | 60 | self.reward_fn.zero_grad() 61 | self.value_fn.zero_grad() 62 | discrim_loss.backward() 63 | self.reward_fn.reinforce() 64 | self.value_fn.reinforce() 65 | 66 | # additional debugging info 67 | self.writer.add_scalar('airl/fake', fake.mean()) 68 | self.writer.add_scalar('airl/real', real.mean()) 69 | 70 | # train base_agent 71 | self.base_agent.train() 72 | -------------------------------------------------------------------------------- /rlil/agents/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from rlil.memory import ExperienceReplayBuffer 3 | from rlil.initializer import get_n_step 4 | from rlil.utils import Samples 5 | 6 | 7 | class Agent(ABC): 8 | """ 9 | Abstract agent class 10 | """ 11 | 12 | @abstractmethod 13 | def act(self, state, reward=None): 14 | """ 15 | Select an action for evaluation. 16 | If the agent has a replay-buffer, state and reward are stored. 17 | 18 | Args: 19 | state (rlil.environment.State): The environment state at the current timestep. 20 | reward (torch.Tensor): The reward from the previous timestep. 21 | 22 | Returns: 23 | rllib.Action: The action to take at the current timestep. 24 | """ 25 | 26 | @abstractmethod 27 | def make_lazy_agent(self, evaluation=False): 28 | """ 29 | Return a LazyAgent object for sampling or evaluation. 30 | 31 | Args: 32 | evaluation (bool, optional): If evaluation==True, the returned 33 | object act greedily. Defaults to False. 34 | 35 | Returns: 36 | LazyAgent: The LazyAgent object for Sampler. 37 | """ 38 | pass 39 | 40 | def train(self): 41 | """ 42 | Update internal parameters 43 | """ 44 | pass 45 | 46 | def load(self, dirname): 47 | """ 48 | Load pretrained agent. 49 | 50 | Args: 51 | dirname (str): Directory where the agent saved 52 | """ 53 | pass 54 | 55 | 56 | class LazyAgent(ABC): 57 | """ 58 | Agent class for Sampler. 
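    A LazyAgent is a lightweight stand-in for an Agent used by sampler workers:
    set_replay_buffer() attaches a local ExperienceReplayBuffer, and act()
    stores the (last_state, last_action, reward, state) sample there before
    computing the next action. Samplers obtain LazyAgents via
    Agent.make_lazy_agent().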
59 | """ 60 | 61 | def __init__(self, 62 | evaluation=False, 63 | store_samples=True): 64 | self._states = None 65 | self._actions = None 66 | self._evaluation = evaluation 67 | self._store_samples = store_samples 68 | self.replay_buffer = None 69 | # for N step replay buffer 70 | self._n_step, self._discount_factor = get_n_step() 71 | if self._evaluation: 72 | self._n_step = 1 # disable Nstep buffer when evaluation mode 73 | 74 | def set_replay_buffer(self, env): 75 | self.replay_buffer = ExperienceReplayBuffer( 76 | 1e7, env, n_step=self._n_step, 77 | discount_factor=self._discount_factor) 78 | 79 | def act(self, states, reward): 80 | """ 81 | In the act function, the lazy_agent put a sample 82 | (last_state, last_action, reward, states) into self.replay_buffer. 83 | Then, it outputs a corresponding action. 84 | """ 85 | if self._store_samples: 86 | assert self.replay_buffer is not None, \ 87 | "Call self.set_replay_buffer(env) at lazy_agent initialization." 88 | samples = Samples(self._states, self._actions, reward, states) 89 | self.replay_buffer.store(samples) 90 | 91 | def compute_priorities(self, samples): 92 | """ 93 | Compute priorities of the given samples. 94 | This method is useful for Apex implementation. 95 | Args: 96 | samples (rlil.utils.Samples) 97 | """ 98 | return None 99 | -------------------------------------------------------------------------------- /rlil/agents/bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.distributions.normal import Normal 3 | from torch.nn.functional import mse_loss 4 | from rlil.environments import State, action_decorator, Action 5 | from rlil.initializer import get_device, get_writer, get_replay_buffer 6 | from rlil import nn 7 | from copy import deepcopy 8 | from .base import Agent, LazyAgent 9 | import os 10 | 11 | 12 | class BC(Agent): 13 | """ 14 | Behavioral Cloning (BC) 15 | 16 | In behavioral cloning, the agent trains a classifier or regressor to 17 | replicate the expert's policy using the training data 18 | both the encountered states and actions. 19 | 20 | Args: 21 | policy (DeterministicPolicy): 22 | An Approximation of a deterministic policy. 23 | minibatch_size (int): 24 | The number of experiences to sample in each training update. 
25 | """ 26 | 27 | def __init__(self, 28 | policy, 29 | minibatch_size=32, 30 | ): 31 | # objects 32 | self.policy = policy 33 | self.replay_buffer = get_replay_buffer() 34 | self.writer = get_writer() 35 | self.device = get_device() 36 | # hyperparameters 37 | self.minibatch_size = minibatch_size 38 | 39 | def act(self, states, reward): 40 | self._states = states 41 | self._actions = Action(self.policy.eval(states.to(self.device))) 42 | return self._actions 43 | 44 | def train(self): 45 | if self.should_train(): 46 | (states, actions, _, _, _, _) = self.replay_buffer.sample( 47 | self.minibatch_size) 48 | policy_actions = Action(self.policy(states)) 49 | loss = mse_loss(policy_actions.features, actions.features) 50 | self.policy.reinforce(loss) 51 | self.writer.train_steps += 1 52 | 53 | def should_train(self): 54 | return True 55 | 56 | def make_lazy_agent(self, *args, **kwargs): 57 | model = deepcopy(self.policy.model) 58 | return BCLazyAgent(model.to("cpu"), *args, **kwargs) 59 | 60 | def load(self, dirname): 61 | for filename in os.listdir(dirname): 62 | if filename == 'policy.pt': 63 | self.policy.model = torch.load(os.path.join( 64 | dirname, filename), map_location=self.device) 65 | 66 | 67 | class BCLazyAgent(LazyAgent): 68 | """ 69 | Agent class for sampler. 70 | """ 71 | 72 | def __init__(self, policy_model, *args, **kwargs): 73 | self._policy_model = policy_model 74 | super().__init__(*args, **kwargs) 75 | 76 | def act(self, states, reward): 77 | super().act(states, reward) 78 | self._states = states 79 | with torch.no_grad(): 80 | self._actions = Action(self._policy_model(states)) 81 | return self._actions 82 | -------------------------------------------------------------------------------- /rlil/agents/gail.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.initializer import get_device, get_writer, get_replay_buffer 3 | from rlil import nn 4 | from .base import Agent 5 | 6 | 7 | class GAIL(Agent): 8 | """ 9 | Generative adversarial imitation learning (GAIL) 10 | 11 | GAIL is composed of two neural networks, the policy (generator) network 12 | and the discriminator network. In the original paper (https://arxiv.org/abs/1606.03476), 13 | the policy network is trained using TRPO. 14 | 15 | Args: 16 | base_agent (rlil.agent.Agent): Agent to train the policy. 17 | minibatch_size (int): 18 | The number of experiences to sample in each discriminator update. 19 | replay_start_size (int): Number of experiences in replay buffer when training begins. 
20 | update_frequency (int): Number of base_agent update per discriminator update 21 | """ 22 | 23 | def __init__(self, 24 | base_agent, 25 | minibatch_size=32, 26 | replay_start_size=5000, 27 | update_frequency=10, 28 | ): 29 | # objects 30 | self.base_agent = base_agent 31 | self.replay_buffer = get_replay_buffer() 32 | self.discriminator = self.replay_buffer.discriminator 33 | self.writer = get_writer() 34 | self.device = get_device() 35 | self.discrim_criterion = nn.BCELoss() 36 | # hyperparameters 37 | self.minibatch_size = minibatch_size 38 | self.replay_start_size = replay_start_size 39 | self.update_frequency = update_frequency 40 | self._train_count = 0 41 | 42 | def act(self, *args, **kwargs): 43 | return self.base_agent.act(*args, **kwargs) 44 | 45 | def train(self): 46 | self._train_count += 1 47 | # train discriminator 48 | if self.should_train(): 49 | samples, expert_samples = self.replay_buffer.sample_both( 50 | self.minibatch_size) 51 | states, actions, _, _, _, _ = samples 52 | exp_states, exp_actions, _, _, _, _ = expert_samples 53 | 54 | fake = self.discriminator( 55 | torch.cat((states.features, actions.features), dim=1)) 56 | real = self.discriminator( 57 | torch.cat((exp_states.features, exp_actions.features), dim=1)) 58 | discrim_loss = self.discrim_criterion(fake, torch.ones_like(fake)) + \ 59 | self.discrim_criterion(real, torch.zeros_like(real)) 60 | self.discriminator.reinforce(discrim_loss) 61 | 62 | # additional debugging info 63 | self.writer.add_scalar('gail/fake', fake.mean()) 64 | self.writer.add_scalar('gail/real', real.mean()) 65 | 66 | # train base_agent 67 | self.base_agent.train() 68 | 69 | def should_train(self): 70 | return len(self.replay_buffer) > self.replay_start_size and \ 71 | self._train_count % self.update_frequency == 0 72 | 73 | def make_lazy_agent(self, *args, **kwargs): 74 | return self.base_agent.make_lazy_agent(*args, **kwargs) 75 | 76 | def load(self, dirname): 77 | self.base_agent.load(dirname) 78 | -------------------------------------------------------------------------------- /rlil/agents/noisy_td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from copy import deepcopy 4 | from torch.distributions.normal import Normal 5 | from rlil.environments import Action 6 | from rlil.initializer import get_device, get_writer, get_replay_buffer 7 | from rlil.memory import ExperienceReplayBuffer 8 | from rlil import nn 9 | from rlil.utils import Samples 10 | from .td3 import TD3, LazyAgent 11 | 12 | 13 | class NoisyTD3(TD3): 14 | """ 15 | Twin Dueling DDPG (TD3) with noisy network. 16 | TD3: https://arxiv.org/abs/1802.09477 17 | Noisy Network: https://arxiv.org/abs/1706.10295 18 | 19 | Args: 20 | q_1 (QContinuous): An Approximation of the continuous action Q-function. 21 | q_2 (QContinuous): An Approximation of the continuous action Q-function. 22 | policy (DeterministicPolicy): An Approximation of a deterministic policy. 23 | discount_factor (float): Discount factor for future rewards. 24 | minibatch_size (int): The number of experiences to sample in each training update. 25 | noise_td3 (float): the amount of noise to add to each action in trick three. 26 | policy_update_td3 (int): Number of timesteps per training update the policy in trick two. 27 | replay_start_size (int): Number of experiences in replay buffer when training begins. 
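    Note:
        Exploration noise comes from noisy layers inside the policy network:
        make_lazy_agent() applies nn.perturb_noisy_layers to a CPU copy of the
        policy before handing it to a sampler worker, so no Gaussian noise is
        added to the actions executed in the environment.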
28 | """ 29 | 30 | def __init__(self, 31 | q_1, 32 | q_2, 33 | policy, 34 | discount_factor=0.99, 35 | minibatch_size=32, 36 | noise_td3=0.2, 37 | policy_update_td3=2, 38 | replay_start_size=5000, 39 | ): 40 | # objects 41 | self.q_1 = q_1 42 | self.q_2 = q_2 43 | self.policy = policy 44 | self.replay_buffer = get_replay_buffer() 45 | self.device = get_device() 46 | self.writer = get_writer() 47 | # hyperparameters 48 | self.replay_start_size = replay_start_size 49 | self.minibatch_size = minibatch_size 50 | self.discount_factor = discount_factor 51 | self._noise_td3 = Normal( 52 | 0, noise_td3*torch.tensor( 53 | (Action.action_space().high - Action.action_space().low) / 2, 54 | dtype=torch.float32, device=self.device)) 55 | 56 | self._policy_update_td3 = policy_update_td3 57 | self._states = None 58 | self._actions = None 59 | self._train_count = 0 60 | 61 | def act(self, states, reward=None): 62 | if reward is not None: 63 | samples = Samples(self._states, self._actions, reward, states) 64 | self.replay_buffer.store(samples) 65 | self._states = states 66 | actions = self.policy.no_grad(states.to(self.device)) 67 | self._actions = Action(actions).to("cpu") 68 | return self._actions 69 | 70 | def make_lazy_agent(self, 71 | evaluation=False, 72 | store_samples=True): 73 | model = deepcopy(self.policy.model) 74 | model.apply(nn.perturb_noisy_layers) 75 | return NoisyTD3LazyAgent(model.to("cpu"), 76 | evaluation=evaluation, 77 | store_samples=store_samples) 78 | 79 | 80 | class NoisyTD3LazyAgent(LazyAgent): 81 | """ 82 | Agent class for sampler. 83 | """ 84 | 85 | def __init__(self, 86 | policy_model, 87 | *args, 88 | **kwargs): 89 | self._policy_model = policy_model 90 | super().__init__(*args, **kwargs) 91 | if self._evaluation: 92 | self._policy_model.eval() 93 | 94 | def act(self, states, reward): 95 | super().act(states, reward) 96 | self._states = states 97 | with torch.no_grad(): 98 | actions = self._policy_model(states) 99 | self._actions = Action(actions) 100 | return self._actions 101 | -------------------------------------------------------------------------------- /rlil/agents/vae_bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.distributions.normal import Normal 3 | from torch.nn.functional import mse_loss 4 | from rlil.environments import State, action_decorator, Action 5 | from rlil.initializer import get_device, get_writer, get_replay_buffer 6 | from rlil import nn 7 | from copy import deepcopy 8 | from .base import Agent, LazyAgent 9 | import os 10 | 11 | 12 | class VaeBC(Agent): 13 | """ 14 | VAE Behavioral Cloning (VAE-BC) 15 | 16 | VaeBC is a behavioral cloning method used in BCQ, BEAR and BRAC. 17 | It replaces the NN regressor in BC implementation with a VAE. 18 | This code is mainly for debugging. 19 | 20 | Args: 21 | encoder (BcqEncoder): An approximation of the encoder. 22 | decoder (BcqDecoder): An approximation of the decoder. 23 | minibatch_size (int): 24 | The number of experiences to sample in each training update. 
25 | """ 26 | 27 | def __init__(self, 28 | encoder, 29 | decoder, 30 | minibatch_size=100, 31 | ): 32 | # objects 33 | self.encoder = encoder 34 | self.decoder = decoder 35 | self.replay_buffer = get_replay_buffer() 36 | self.writer = get_writer() 37 | self.device = get_device() 38 | # hyperparameters 39 | self.minibatch_size = minibatch_size 40 | 41 | def act(self, states, reward): 42 | # batch x num_decode x d 43 | vae_actions, _ = \ 44 | self.decoder.decode_multiple(states.to(self.device), num_decode=10) 45 | # batch x d 46 | vae_actions = vae_actions.mean(1) 47 | return Action(vae_actions) 48 | 49 | def train(self): 50 | (states, actions, _, _, _, _) = self.replay_buffer.sample( 51 | self.minibatch_size) 52 | 53 | # train vae 54 | mean, log_var = self.encoder( 55 | states.to(self.device), actions.to(self.device)) 56 | z = mean + (0.5 * log_var).exp() * torch.randn_like(log_var) 57 | vae_actions = Action(self.decoder(states, z)) 58 | vae_mse = mse_loss(actions.features, vae_actions.features) 59 | vae_kl = nn.kl_loss_vae(mean, log_var) 60 | vae_loss = vae_mse + vae_kl 61 | self.decoder.reinforce(vae_loss) 62 | self.encoder.reinforce() 63 | self.writer.add_scalar('loss/vae/mse', vae_mse.detach()) 64 | self.writer.add_scalar('loss/vae/kl', vae_kl.detach()) 65 | self.writer.train_steps += 1 66 | 67 | def should_train(self): 68 | return True 69 | 70 | def make_lazy_agent(self, *args, **kwargs): 71 | decoder_model = deepcopy(self.decoder.model) 72 | return VaeBcLazyAgent(decoder_model.to("cpu"), *args, **kwargs) 73 | 74 | def load(self, dirname): 75 | for filename in os.listdir(dirname): 76 | if filename in ('encoder.pt'): 77 | self.encoder.model = torch.load(os.path.join(dirname, filename), 78 | map_location=self.device) 79 | if filename in ('decoder.pt'): 80 | self.decoder.model = torch.load(os.path.join(dirname, filename), 81 | map_location=self.device) 82 | 83 | 84 | class VaeBcLazyAgent(LazyAgent): 85 | """ 86 | Agent class for sampler. 
87 | """ 88 | 89 | def __init__(self, decoder_model, *args, **kwargs): 90 | self._decoder_model = decoder_model 91 | super().__init__(*args, **kwargs) 92 | 93 | def act(self, states, reward): 94 | super().act(states, reward) 95 | self._states = states 96 | with torch.no_grad(): 97 | # batch x num_decode x d 98 | actions, _ = \ 99 | self._decoder_model.decode_multiple(states, num_decode=10) 100 | # batch x d 101 | self._actions = Action(actions.mean(1)) 102 | return self._actions 103 | -------------------------------------------------------------------------------- /rlil/approximation/__init__.py: -------------------------------------------------------------------------------- 1 | from .approximation import Approximation 2 | from .ensemble_q_continuous import EnsembleQContinuous 3 | from .q_continuous import QContinuous 4 | from .q_network import QNetwork 5 | from .v_network import VNetwork 6 | from .bcq_auto_encoder import BcqEncoder, BcqDecoder 7 | from .discriminator import Discriminator 8 | from .target import TargetNetwork, FixedTarget, PolyakTarget, TrivialTarget 9 | from .checkpointer import Checkpointer, DummyCheckpointer, PeriodicCheckpointer 10 | from .feature_network import FeatureNetwork 11 | from .dynamics import Dynamics 12 | 13 | 14 | __all__ = [ 15 | "Approximation", 16 | "EnsembleQContinuous", 17 | "QContinuous", 18 | "QNetwork", 19 | "VNetwork", 20 | "BcqEncoder", 21 | "BcqDecoder", 22 | "Discriminator", 23 | "TargetNetwork", 24 | "FixedTarget", 25 | "PolyakTarget", 26 | "TrivialTarget", 27 | "Checkpointer", 28 | "DummyCheckpointer", 29 | "PeriodicCheckpointer", 30 | "FeatureNetwork", 31 | "Dynamics" 32 | ] 33 | -------------------------------------------------------------------------------- /rlil/approximation/checkpointer/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from abc import abstractmethod, ABC 3 | import torch 4 | import os 5 | from rlil.initializer import get_writer 6 | 7 | 8 | class Checkpointer(ABC): 9 | @abstractmethod 10 | def init(self, model, filename): 11 | pass 12 | 13 | @abstractmethod 14 | def __call__(self): 15 | pass 16 | 17 | 18 | class DummyCheckpointer(Checkpointer): 19 | def init(self, *inputs): 20 | pass 21 | 22 | def __call__(self): 23 | pass 24 | 25 | 26 | class PeriodicCheckpointer(Checkpointer): 27 | def __init__(self, frequency): 28 | self.frequency = frequency 29 | self._writer = get_writer() 30 | self._log_dir = None 31 | self._filename = None 32 | self._model = None 33 | 34 | def init(self, model, log_dir, filename): 35 | self._model = model 36 | self._log_dir = log_dir 37 | self._filename = filename 38 | # Some builds of pytorch throw this unhelpful warning. 39 | # We can safely disable it. 
40 | # https://discuss.pytorch.org/t/got-warning-couldnt-retrieve-source-code-for-container/7689/7 41 | warnings.filterwarnings( 42 | "ignore", message="Couldn't retrieve source code") 43 | 44 | def __call__(self): 45 | # save pereodically 46 | # if self._writer.train_steps % self.frequency == 0: 47 | # save_dir = os.path.join(self._log_dir, str(self._writer.train_steps)) 48 | # if not os.path.exists(save_dir): 49 | # os.makedirs(save_dir) 50 | # torch.save(self._model, os.path.join( 51 | # save_dir, self._filename) + ".pt") 52 | 53 | if self._writer.train_steps % self.frequency == 0: 54 | torch.save(self._model, 55 | os.path.join(self._log_dir, self._filename + ".pt")) 56 | -------------------------------------------------------------------------------- /rlil/approximation/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.nn import RLNetwork 3 | from .approximation import Approximation 4 | 5 | 6 | class Discriminator(Approximation): 7 | def __init__( 8 | self, 9 | model, 10 | optimizer, 11 | name='discriminator', 12 | **kwargs 13 | ): 14 | model = DiscriminatorModule(model) 15 | super().__init__( 16 | model, 17 | optimizer, 18 | name=name, 19 | **kwargs 20 | ) 21 | 22 | def expert_reward(self, features): 23 | rew = torch.log(self.model(features)) - \ 24 | torch.log(1 - self.model(features)) 25 | return rew.squeeze().detach() 26 | 27 | 28 | class DiscriminatorModule(RLNetwork): 29 | def forward(self, features): 30 | return self.model(features) 31 | -------------------------------------------------------------------------------- /rlil/approximation/dynamics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.environments import State 3 | from rlil.nn import RLNetwork 4 | from .approximation import Approximation 5 | 6 | 7 | class Dynamics(Approximation): 8 | def __init__( 9 | self, 10 | model, 11 | optimizer, 12 | name='dynamics', 13 | **kwargs 14 | ): 15 | model = DynamicsModule(model) 16 | super().__init__( 17 | model, 18 | optimizer, 19 | name=name, 20 | **kwargs 21 | ) 22 | 23 | 24 | class DynamicsModule(RLNetwork): 25 | def forward(self, states, actions): 26 | x = torch.cat((states.features.float(), 27 | actions.features.float()), dim=1) 28 | diff_features = self.model(x) 29 | next_features = states.features + diff_features 30 | 31 | return State( 32 | next_features, 33 | mask=states.mask, 34 | info=states.info 35 | ) 36 | -------------------------------------------------------------------------------- /rlil/approximation/ensemble_q_continuous.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.nn import RLNetwork 3 | from .approximation import Approximation 4 | 5 | 6 | class EnsembleQContinuous(Approximation): 7 | def __init__( 8 | self, 9 | models: torch.nn.ModuleList, 10 | optimizer, 11 | name='ensemble_q', 12 | **kwargs 13 | ): 14 | model = EnsembleQContinuousModule(models) 15 | super().__init__( 16 | model, 17 | optimizer, 18 | name=name, 19 | **kwargs 20 | ) 21 | 22 | def q1(self, *args, **kwargs): 23 | return self.model.q1(*args, **kwargs) 24 | 25 | 26 | class EnsembleQContinuousModule(RLNetwork): 27 | def forward(self, states, actions): 28 | all_qs = [] 29 | x = torch.cat((states.features.float(), 30 | actions.features.float()), dim=1) 31 | for m in self.model: 32 | all_qs.append((m(x).squeeze(-1) 33 | * states.mask.float()).unsqueeze(1)) 34 | all_qs = torch.cat(all_qs, 
dim=1) 35 | return all_qs # batch x num_q 36 | 37 | def q1(self, states, actions): 38 | x = torch.cat((states.features.float(), 39 | actions.features.float()), dim=1) 40 | return self.model[0](x).squeeze(-1) * states.mask.float() 41 | -------------------------------------------------------------------------------- /rlil/approximation/feature_network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.environments import State 3 | from .approximation import Approximation 4 | 5 | 6 | class FeatureNetwork(Approximation): 7 | ''' 8 | A special type of Approximation that accumulates gradients before backpropagating them. 9 | This is useful when features are shared between network heads. 10 | 11 | The __call__ function caches the computation graph and detaches the output. 12 | Then, various functions approximators may backpropagate to the output. 13 | The reinforce() function will then backpropagate the accumulated gradients on the output 14 | through the original computation graph. 15 | ''' 16 | 17 | def __init__(self, model, optimizer=None, name='feature', **kwargs): 18 | model = FeatureModule(model) 19 | super().__init__(model, optimizer, name=name, **kwargs) 20 | self._cache = [] 21 | self._out = [] 22 | 23 | def __call__(self, states): 24 | ''' 25 | Run a forward pass of the model and return the detached output. 26 | 27 | Args: 28 | state (all.environment.State): An environment State 29 | 30 | Returns: 31 | all.environment.State: An enviornment State with the computed features 32 | ''' 33 | features = self.model(states) 34 | graphs = features.raw 35 | features._raw = graphs.detach() 36 | features._raw.requires_grad = True 37 | self._enqueue(graphs, features._raw) 38 | return features 39 | 40 | def reinforce(self): 41 | ''' 42 | Backward pass of the model. 
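        Backpropagates the gradients that downstream heads accumulated on the
        detached outputs cached by __call__, then steps the optimizer and
        clears the cache, so call it after those heads have run their own
        backward passes.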
43 | ''' 44 | self._optimizer.zero_grad() 45 | graphs, grads = self._dequeue() 46 | graphs.backward(grads) 47 | self.step() 48 | 49 | def _enqueue(self, features, out): 50 | self._cache.append(features) 51 | self._out.append(out) 52 | 53 | def _dequeue(self): 54 | graphs = [] 55 | grads = [] 56 | for graph, out in zip(self._cache, self._out): 57 | if out.grad is not None: 58 | graphs.append(graph) 59 | grads.append(out.grad) 60 | self._cache = [] 61 | self._out = [] 62 | return torch.cat(graphs), torch.cat(grads) 63 | 64 | 65 | class FeatureModule(torch.nn.Module): 66 | def __init__(self, model): 67 | super().__init__() 68 | self.model = model 69 | 70 | def forward(self, states): 71 | features = self.model(states.features.float()) 72 | return State( 73 | features, 74 | mask=states.mask, 75 | info=states.info 76 | ) 77 | -------------------------------------------------------------------------------- /rlil/approximation/q_continuous.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.nn import RLNetwork 3 | from .approximation import Approximation 4 | 5 | 6 | class QContinuous(Approximation): 7 | def __init__( 8 | self, 9 | model, 10 | optimizer, 11 | name='q', 12 | **kwargs 13 | ): 14 | model = QContinuousModule(model) 15 | super().__init__( 16 | model, 17 | optimizer, 18 | name=name, 19 | **kwargs 20 | ) 21 | 22 | 23 | class QContinuousModule(RLNetwork): 24 | def forward(self, states, actions): 25 | x = torch.cat((states.features.float(), 26 | actions.features.float()), dim=1) 27 | return self.model(x).squeeze(-1) * states.mask.float() 28 | -------------------------------------------------------------------------------- /rlil/approximation/q_network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.nn import RLNetwork 3 | from .approximation import Approximation 4 | 5 | 6 | class QNetwork(Approximation): 7 | def __init__( 8 | self, 9 | model, 10 | optimizer, 11 | name='q', 12 | **kwargs 13 | ): 14 | model = QModule(model) 15 | super().__init__( 16 | model, 17 | optimizer, 18 | name=name, 19 | **kwargs 20 | ) 21 | 22 | 23 | class QModule(RLNetwork): 24 | def forward(self, states, actions=None): 25 | values = super().forward(states) 26 | if actions is None: 27 | return values 28 | return values.gather(1, actions.features.view(-1, 1)).squeeze(1) 29 | -------------------------------------------------------------------------------- /rlil/approximation/target/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract import TargetNetwork 2 | from .fixed import FixedTarget 3 | from .polyak import PolyakTarget 4 | from .trivial import TrivialTarget 5 | -------------------------------------------------------------------------------- /rlil/approximation/target/abstract.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABC 2 | 3 | 4 | class TargetNetwork(ABC): 5 | @abstractmethod 6 | def __call__(self, *inputs): 7 | pass 8 | 9 | @abstractmethod 10 | def init(self, model): 11 | pass 12 | 13 | @abstractmethod 14 | def update(self): 15 | pass 16 | -------------------------------------------------------------------------------- /rlil/approximation/target/fixed.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | from .abstract import TargetNetwork 4 | 5 | 6 | class 
FixedTarget(TargetNetwork): 7 | def __init__(self, update_frequency): 8 | self._source = None 9 | self._target = None 10 | self._updates = 0 11 | self._update_frequency = update_frequency 12 | 13 | def __call__(self, *inputs): 14 | with torch.no_grad(): 15 | return self._target(*inputs) 16 | 17 | def init(self, model): 18 | self._source = model 19 | self._target = copy.deepcopy(model) 20 | 21 | def update(self): 22 | self._updates += 1 23 | if self._should_update(): 24 | self._target.load_state_dict(self._source.state_dict()) 25 | 26 | def _should_update(self): 27 | return self._updates % self._update_frequency == 0 28 | -------------------------------------------------------------------------------- /rlil/approximation/target/polyak.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | from .abstract import TargetNetwork 4 | 5 | 6 | class PolyakTarget(TargetNetwork): 7 | '''TargetNetwork that updates using polyak averaging''' 8 | 9 | def __init__(self, rate): 10 | self._source = None 11 | self._target = None 12 | self._rate = rate 13 | 14 | def __call__(self, *inputs): 15 | with torch.no_grad(): 16 | return self._target(*inputs) 17 | 18 | def init(self, model): 19 | self._source = model 20 | self._target = copy.deepcopy(model) 21 | 22 | def update(self): 23 | for target_param, source_param in zip(self._target.parameters(), self._source.parameters()): 24 | target_param.data.copy_( 25 | target_param.data * (1.0 - self._rate) + 26 | source_param.data * self._rate 27 | ) 28 | -------------------------------------------------------------------------------- /rlil/approximation/target/trivial.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .abstract import TargetNetwork 3 | 4 | 5 | class TrivialTarget(TargetNetwork): 6 | def __init__(self): 7 | self._target = None 8 | 9 | def __call__(self, *inputs): 10 | with torch.no_grad(): 11 | return self._target(*inputs) 12 | 13 | def init(self, model): 14 | self._target = model 15 | 16 | def update(self): 17 | pass 18 | -------------------------------------------------------------------------------- /rlil/approximation/v_network.py: -------------------------------------------------------------------------------- 1 | from rlil.nn import RLNetwork 2 | from .approximation import Approximation 3 | 4 | 5 | class VNetwork(Approximation): 6 | def __init__( 7 | self, 8 | model, 9 | optimizer, 10 | name='v', 11 | **kwargs 12 | ): 13 | model = VModule(model) 14 | super().__init__( 15 | model, 16 | optimizer, 17 | name=name, 18 | **kwargs 19 | ) 20 | 21 | 22 | class VModule(RLNetwork): 23 | def forward(self, states): 24 | return super().forward(states).squeeze(-1) 25 | -------------------------------------------------------------------------------- /rlil/environments/reward_fns.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import torch 4 | import numpy as np 5 | 6 | 7 | class PendulumReward: 8 | def __call__(self, states, next_states, actions): 9 | # reward function of Pendulum-v0 10 | thetas = torch.atan2(states.features[:, 1], states.features[:, 0]) 11 | theta_dots = states.features[:, 2] 12 | 13 | def angle_normalize(x): 14 | return (((x+np.pi) % (2*np.pi)) - np.pi) 15 | 16 | costs = angle_normalize(thetas) ** 2 \ 17 | + .1 * theta_dots ** 2 \ 18 | + .001*(actions.features.squeeze()**2) 19 | return -costs 20 | 21 | 22 | class MountainCarContinuousReward: 23 | 
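    """Reward model of MountainCarContinuous-v0 (as implemented below): +100
    while the car is at or beyond the goal position with non-negative velocity,
    minus an energy penalty of 0.1 * action^2."""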
def __init__(self): 24 | self.goal_position = 0.45 25 | self.goal_velocity = 0 26 | 27 | def __call__(self, states, next_states, actions): 28 | positions = states.features[:, 0] 29 | velocities = states.features[:, 1] 30 | goals = (positions >= self.goal_position) & ( 31 | velocities >= self.goal_velocity) 32 | 33 | rewards = torch.zeros(len(states), dtype=torch.float32) 34 | rewards += goals * 100.0 35 | rewards -= actions.features[:, 0] ** 2 * 0.1 36 | return rewards 37 | -------------------------------------------------------------------------------- /rlil/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | from .experiment import Experiment 2 | from .trainer import Trainer 3 | 4 | __all__ = [ 5 | "Experiment", 6 | "Trainer" 7 | ] 8 | -------------------------------------------------------------------------------- /rlil/experiments/experiment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rlil.utils.writer import ExperimentWriter 3 | from rlil.initializer import get_logger, get_writer, set_writer, set_logger, set_seed 4 | from rlil.samplers import AsyncSampler 5 | from .trainer import Trainer 6 | import os 7 | import logging 8 | import json 9 | import git 10 | import warnings 11 | 12 | 13 | class Experiment: 14 | def __init__( 15 | self, 16 | agent_fn, 17 | env, 18 | agent_name=None, 19 | args_dict={}, 20 | exp_info="default_experiments", 21 | seed=0, 22 | trains_per_episode=20, 23 | num_workers=1, 24 | num_workers_eval=1, 25 | max_sample_frames=np.inf, 26 | max_sample_episodes=np.inf, 27 | max_train_steps=np.inf, 28 | train_minutes=np.inf 29 | ): 30 | # set_seed 31 | set_seed(seed) 32 | 33 | # set writer 34 | if agent_name is None: 35 | agent_name = agent_fn.__name__[1:].replace("_", "-") 36 | writer = self._make_writer(agent_name, env.name, exp_info) 37 | message = "\n# Experiment: " + exp_info 38 | message += " \n# Parameters: \n" 39 | message += json.dumps(args_dict, indent=4, 40 | sort_keys=True).replace("\n", " \n") 41 | 42 | # write git diff 43 | try: 44 | repo = git.Repo('./') 45 | t = repo.head.commit.tree 46 | diff = repo.git.diff(t).replace("\n", " \n") 47 | message += " \n# Git diff: \n" + diff 48 | except git.InvalidGitRepositoryError: 49 | warnings.warn( 50 | "Current repository doesn't have .git. 
git diff is not recorded.") 51 | 52 | writer.add_text("exp_summary", message) 53 | set_writer(writer) 54 | 55 | # set logger 56 | logger = get_logger() 57 | handler = logging.FileHandler( 58 | os.path.join(writer.log_dir, "logger.log")) 59 | fmt = logging.Formatter('%(levelname)s : %(asctime)s : %(message)s') 60 | handler.setFormatter(fmt) 61 | logger.addHandler(handler) 62 | set_logger(logger) 63 | 64 | # save args 65 | with open(os.path.join(writer.log_dir, "args.json"), mode="w") as f: 66 | json.dump(args_dict, f) 67 | 68 | # start training 69 | agent = agent_fn(env) 70 | 71 | sampler = AsyncSampler(env, num_workers=num_workers) \ 72 | if num_workers > 0 else None 73 | eval_sampler = AsyncSampler(env, num_workers=num_workers_eval) \ 74 | if num_workers_eval > 0 else None 75 | 76 | trainer = Trainer( 77 | agent=agent, 78 | sampler=sampler, 79 | eval_sampler=eval_sampler, 80 | trains_per_episode=trains_per_episode, 81 | max_sample_frames=max_sample_frames, 82 | max_sample_episodes=max_sample_episodes, 83 | max_train_steps=max_train_steps, 84 | train_minutes=train_minutes 85 | ) 86 | 87 | trainer.start_training() 88 | 89 | def _make_writer(self, agent_name, env_name, exp_info): 90 | return ExperimentWriter(agent_name=agent_name, 91 | env_name=env_name, 92 | exp_info=exp_info) 93 | -------------------------------------------------------------------------------- /rlil/initializer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import logging 5 | from rlil.utils.writer import DummyWriter 6 | 7 | os.environ["PYTHONWARNINGS"] = 'ignore:semaphore_tracker:UserWarning' 8 | 9 | _DEBUG_MODE = False 10 | 11 | 12 | def enable_debug_mode(): 13 | global _DEBUG_MODE 14 | print("-----DEBUG_MODE: True-----") 15 | torch.autograd.set_detect_anomaly(True) 16 | _DEBUG_MODE = True 17 | 18 | 19 | def disable_debug_mode(): 20 | global _DEBUG_MODE 21 | print("-----DEBUG_MODE: False-----") 22 | _DEBUG_MODE = False 23 | 24 | 25 | def is_debug_mode(): 26 | global _DEBUG_MODE 27 | return _DEBUG_MODE 28 | 29 | 30 | _DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | 32 | 33 | def set_device(device): 34 | global _DEVICE 35 | _DEVICE = device 36 | 37 | 38 | def get_device(): 39 | return _DEVICE 40 | 41 | 42 | _SEED = 0 43 | 44 | 45 | def set_seed(seed): 46 | global _SEED 47 | np.random.seed(seed) 48 | torch.manual_seed(seed) 49 | if torch.cuda.is_available(): 50 | torch.cuda.manual_seed_all(seed) 51 | _SEED = seed 52 | print("-----SEED: {}-----".format(_SEED)) 53 | 54 | 55 | def call_seed(): 56 | global _SEED 57 | np.random.seed(_SEED) 58 | torch.manual_seed(_SEED) 59 | if torch.cuda.is_available(): 60 | torch.cuda.manual_seed_all(_SEED) 61 | return _SEED 62 | 63 | 64 | _WRITER = DummyWriter() 65 | 66 | 67 | def set_writer(writer): 68 | global _WRITER 69 | _WRITER = writer 70 | 71 | 72 | def get_writer(): 73 | return _WRITER 74 | 75 | 76 | _LOGGER = logging.getLogger(__name__) 77 | 78 | 79 | def set_logger(logger): 80 | global _LOGGER 81 | _LOGGER = logger 82 | 83 | 84 | def get_logger(): 85 | return _LOGGER 86 | 87 | 88 | _REPLAY_BUFFER = None 89 | 90 | 91 | def set_replay_buffer(replay_buffer): 92 | global _REPLAY_BUFFER 93 | _REPLAY_BUFFER = replay_buffer 94 | 95 | 96 | def get_replay_buffer(): 97 | global _REPLAY_BUFFER 98 | if _REPLAY_BUFFER is None: 99 | raise ValueError("replay_buffer is not set") 100 | return _REPLAY_BUFFER 101 | 102 | 103 | _ON_POLICY_MODE = False 104 | 105 | 106 | def 
enable_on_policy_mode(): 107 | global _ON_POLICY_MODE 108 | _ON_POLICY_MODE = True 109 | print("-----ON_POLICY_MODE: {}-----".format(_ON_POLICY_MODE)) 110 | 111 | 112 | def disable_on_policy_mode(): 113 | global _ON_POLICY_MODE 114 | _ON_POLICY_MODE = False 115 | print("-----ON_POLICY_MODE: {}-----".format(_ON_POLICY_MODE)) 116 | 117 | 118 | def is_on_policy_mode(): 119 | global _ON_POLICY_MODE 120 | return _ON_POLICY_MODE 121 | 122 | 123 | # parameters of NstepExperienceReplay 124 | _NSTEP = 1 125 | _DISCOUNT_FACTOR = 0.95 126 | 127 | 128 | def set_n_step(n_step, discount_factor=0.95): 129 | global _NSTEP, _DISCOUNT_FACTOR 130 | _NSTEP = n_step 131 | _DISCOUNT_FACTOR = discount_factor 132 | print("-----N step: {}-----".format(_NSTEP)) 133 | print("-----Discount factor: {}-----".format(_DISCOUNT_FACTOR)) 134 | 135 | 136 | def get_n_step(): 137 | global _NSTEP, _DISCOUNT_FACTOR 138 | return _NSTEP, _DISCOUNT_FACTOR 139 | 140 | 141 | _USE_APEX = False 142 | 143 | 144 | def enable_apex(): 145 | global _USE_APEX 146 | _USE_APEX = True 147 | print("-----USE_APEX: {}-----".format(_USE_APEX)) 148 | 149 | 150 | def disable_apex(): 151 | global _USE_APEX 152 | _USE_APEX = False 153 | print("-----USE_APEX: {}-----".format(_USE_APEX)) 154 | 155 | 156 | def use_apex(): 157 | global _USE_APEX 158 | return _USE_APEX 159 | -------------------------------------------------------------------------------- /rlil/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from .replay_buffer import ( 2 | BaseReplayBuffer, 3 | ExperienceReplayBuffer, 4 | ) 5 | from .gail_wrapper import GailWrapper 6 | from .gae_wrapper import GaeWrapper 7 | from .sqil_wrapper import SqilWrapper 8 | from .airl_wrapper import AirlWrapper 9 | from cpprb import ReplayBuffer 10 | 11 | 12 | __all__ = [ 13 | "ReplayBuffer", 14 | "BaseReplayBuffer", 15 | "ExperienceReplayBuffer", 16 | "GailWrapper", 17 | "GaeWrapper", 18 | "SqilWrapper", 19 | "AirlWrapper" 20 | ] 21 | -------------------------------------------------------------------------------- /rlil/memory/airl_wrapper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import numpy as np 3 | import torch 4 | from rlil.environments import State, Action 5 | from rlil.initializer import get_device, is_debug_mode 6 | from .gail_wrapper import GailWrapper 7 | 8 | 9 | class AirlWrapper(GailWrapper): 10 | """ 11 | A wrapper of ExperienceReplayBuffer for rlil.agents.AIRL. 12 | """ 13 | 14 | def __init__(self, 15 | buffer, 16 | expert_buffer, 17 | reward_fn, 18 | value_fn, 19 | policy, 20 | feature_nw=None, 21 | discount_factor=1.0): 22 | """ 23 | Args: 24 | buffer (rlil.memory.ExperienceReplayBuffer): 25 | A replay_buffer for sampling. 26 | expert_buffer (rlil.memory.ExperienceReplayBuffer): 27 | A replay_buffer with expert trajectories. 28 | reward_fn (rlil.approximation.Approximation): 29 | A reward function approximation. 30 | value_fn (rlil.approximation.Approximation): 31 | A value function approximation. 
32 | policy (rlil.policies): 33 | A policy approximation 34 | feature_nw (rlil.approximation.FeatureNetwork) 35 | """ 36 | self.buffer = buffer 37 | self.expert_buffer = expert_buffer 38 | self.device = get_device() 39 | self.reward_fn = reward_fn 40 | self.value_fn = value_fn 41 | self.policy = policy 42 | self.feature_nw = feature_nw 43 | self.discount_factor = discount_factor 44 | 45 | def sample(self, batch_size): 46 | # replace the rewards with gail rewards 47 | states, actions, rewards, next_states, weights, indexes = \ 48 | self.buffer.sample(batch_size) 49 | 50 | ds = self.discrim(states, actions, next_states) 51 | rewards = self.expert_reward(ds) 52 | return (states, actions, rewards, next_states, weights, indexes) 53 | 54 | def discrim(self, states, actions, next_states): 55 | if self.feature_nw is None: 56 | features = states 57 | else: 58 | features = self.feature_nw.no_grad(states) 59 | policy_prob = self.policy.no_grad(features).log_prob( 60 | actions.features).exp() 61 | 62 | f = self.reward_fn( 63 | torch.cat((states.features, actions.features), dim=1)).squeeze(1) \ 64 | + next_states.mask.float() \ 65 | * (self.discount_factor * self.value_fn(next_states) 66 | - self.value_fn(states)) 67 | f_exp = f.exp() 68 | d = f_exp / (f_exp + policy_prob) 69 | return d 70 | 71 | def expert_reward(self, d): 72 | return (torch.log(d) - torch.log(1 - d)).squeeze().detach() 73 | -------------------------------------------------------------------------------- /rlil/memory/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class BaseReplayBuffer(ABC): 5 | @abstractmethod 6 | def store(self, states, actions, rewards, next_states): 7 | """Store the transition in the buffer 8 | Args: 9 | states (rlil.environment.State): batch_size x shape 10 | actions (rlil.environment.Action): batch_size x shape 11 | rewards (torch.Tensor): batch_size 12 | next_states (rlil.environment.State): batch_size x shape 13 | """ 14 | 15 | @abstractmethod 16 | def sample(self, batch_size): 17 | '''Sample from the stored transitions''' 18 | 19 | @abstractmethod 20 | def update_priorities(self, indexes, td_errors): 21 | '''Update priorities based on the TD error''' 22 | 23 | @abstractmethod 24 | def get_all_transitions(self): 25 | '''Return all the samples''' 26 | 27 | @abstractmethod 28 | def clear(self): 29 | '''Clear replay buffer''' 30 | 31 | 32 | class BaseBufferWrapper(ABC): 33 | def __init__(self, buffer): 34 | self.buffer = buffer 35 | 36 | def store(self, *args, **kwargs): 37 | self.buffer.store(*args, **kwargs) 38 | 39 | def sample(self, *args, **kwargs): 40 | return self.buffer.sample(*args, **kwargs) 41 | 42 | def update_priorities(self, *args, **kwargs): 43 | self.buffer.update_priorities(*args, **kwargs) 44 | 45 | def clear(self): 46 | self.buffer.clear() 47 | 48 | def get_all_transitions(self): 49 | return self.buffer.get_all_transitions() 50 | 51 | def samples_from_cpprb(self, *args, **kwargs): 52 | return self.buffer.samples_from_cpprb(*args, **kwargs) 53 | 54 | def __len__(self): 55 | return len(self.buffer) 56 | -------------------------------------------------------------------------------- /rlil/memory/gae_wrapper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import numpy as np 3 | import torch 4 | from rlil.environments import State, Action 5 | from rlil.initializer import get_device, is_debug_mode 6 | from .replay_buffer 
import ExperienceReplayBuffer 7 | from .base import BaseBufferWrapper 8 | 9 | 10 | class GaeWrapper(BaseBufferWrapper): 11 | """ 12 | A wrapper of ExperienceReplayBuffer for Generalized Advantage Estimation. 13 | https://arxiv.org/abs/1506.02438 14 | """ 15 | 16 | def __init__(self, buffer, discount_factor=1, lam=1): 17 | """ 18 | Args: 19 | buffer (rlil.memory.ExperienceReplayBuffer): 20 | A replay_buffer for sampling. 21 | """ 22 | self.buffer = buffer 23 | self.device = get_device() 24 | self.discount_factor = discount_factor 25 | self.lam = lam 26 | 27 | def compute_gae(self, rewards, values, next_values, masks): 28 | td_errors = rewards + self.discount_factor * next_values - values 29 | 30 | # compute_gaes 31 | length = len(td_errors) 32 | gaes = torch.zeros(length, device=self.device) 33 | 34 | gae = 0.0 35 | for i in reversed(range(length)): 36 | mask = masks[i].float() 37 | gae = td_errors[i] + self.discount_factor * self.lam * gae * mask 38 | gaes[i] = gae 39 | 40 | # normalize Advantage 41 | # see: https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/issues/102 42 | gaes = (gaes - gaes.mean()) / gaes.std() 43 | return gaes -------------------------------------------------------------------------------- /rlil/memory/gail_wrapper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import numpy as np 3 | import torch 4 | from rlil.environments import State, Action 5 | from rlil.initializer import get_device, is_debug_mode 6 | from .replay_buffer import ExperienceReplayBuffer 7 | from .base import BaseBufferWrapper 8 | from .gae_wrapper import GaeWrapper 9 | 10 | 11 | class GailWrapper(BaseBufferWrapper): 12 | """ 13 | A wrapper of ExperienceReplayBuffer for rlil.agents.GAIL. 14 | """ 15 | 16 | def __init__(self, buffer, expert_buffer, discriminator): 17 | """ 18 | Args: 19 | buffer (rlil.memory.ExperienceReplayBuffer): 20 | A replay_buffer for sampling. 21 | expert_buffer (rlil.memory.ExperienceReplayBuffer): 22 | A replay_buffer with expert trajectories. 23 | discriminator (rlil.approximation.Discriminator): 24 | A discriminator approximation. 
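        Example (an illustrative sketch of the wiring used by
        rlil.presets.continuous.gail; env, transitions and
        discriminator are assumed to exist already):

            from rlil.initializer import get_replay_buffer
            from rlil.memory import ExperienceReplayBuffer, GailWrapper

            expert_buffer = ExperienceReplayBuffer(1e7, env)
            expert_buffer.store(
                expert_buffer.samples_from_cpprb(transitions, device="cpu"))
            buffer = GailWrapper(get_replay_buffer(), expert_buffer, discriminator)
            # half of each batch comes from the agent, half from the expert
            samples, expert_samples = buffer.sample_both(512)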
25 | """ 26 | self.buffer = buffer 27 | self.expert_buffer = expert_buffer 28 | self.device = get_device() 29 | self.discriminator = discriminator 30 | 31 | def sample(self, batch_size): 32 | # replace the rewards with gail rewards 33 | states, actions, rewards, next_states, weights, indexes = \ 34 | self.buffer.sample(batch_size) 35 | 36 | rewards = self.discriminator.expert_reward( 37 | torch.cat((states.features, actions.features), dim=1)) 38 | return (states, actions, rewards, next_states, weights, indexes) 39 | 40 | def sample_both(self, batch_size): 41 | batch_size = int(batch_size / 2) 42 | samples = self.buffer.sample(batch_size) 43 | expert_samples = self.expert_buffer.sample(batch_size) 44 | return samples, expert_samples 45 | 46 | def get_all_transitions(self): 47 | # return the sampled trajectories 48 | # not including expert trajectories 49 | return self.buffer.get_all_transitions() 50 | 51 | def compute_gae(self, *args, **kwargs): 52 | # wrap function for GaeWrapper 53 | if isinstance(self.buffer, GaeWrapper): 54 | return self.buffer.compute_gae(*args, **kwargs) 55 | 56 | def clear(self): 57 | self.buffer.clear() 58 | 59 | def __len__(self): 60 | # return the number of sampled trajectories 61 | # not including expert trajectories 62 | return len(self.buffer) 63 | -------------------------------------------------------------------------------- /rlil/memory/sqil_wrapper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import numpy as np 3 | import torch 4 | from rlil.environments import State, Action 5 | from rlil.initializer import get_device, is_debug_mode 6 | from .replay_buffer import ExperienceReplayBuffer 7 | from .base import BaseBufferWrapper 8 | from .gae_wrapper import GaeWrapper 9 | 10 | 11 | class SqilWrapper(BaseBufferWrapper): 12 | """ 13 | SQIL is a behavior cloning method which regularizes the 14 | reward to sparse by giving the agent a constant 15 | reward of r = +1 for matching the demonstrated action in 16 | a demonstrated state, and giving the agent a constant reward 17 | of r = 0 for all other behavior. 18 | https://arxiv.org/abs/1905.11108 19 | """ 20 | 21 | def __init__(self, buffer, expert_buffer): 22 | """ 23 | Args: 24 | buffer (rlil.memory.ExperienceReplayBuffer): 25 | A replay_buffer for sampling. 26 | expert_buffer (rlil.memory.ExperienceReplayBuffer): 27 | A replay_buffer with expert trajectories. 
28 | """ 29 | self.buffer = buffer 30 | self.expert_buffer = expert_buffer 31 | self.device = get_device() 32 | 33 | def sample(self, batch_size): 34 | batch_size = int(batch_size / 2) 35 | states, actions, rewards, next_states, weights, indexes = \ 36 | self.buffer.sample(batch_size) 37 | exp_states, exp_actions, exp_rewards, exp_next_states, \ 38 | exp_weights, exp_indexes = self.expert_buffer.sample(batch_size) 39 | 40 | rewards = torch.zeros_like(rewards, dtype=torch.float32, 41 | device=self.device) 42 | exp_rewards = torch.ones_like(exp_rewards, dtype=torch.float32, 43 | device=self.device) 44 | 45 | states = State.from_list([states, exp_states]) 46 | actions = Action.from_list([actions, exp_actions]) 47 | rewards = torch.cat([rewards, exp_rewards], axis=0) 48 | next_states = State.from_list([next_states, exp_next_states]) 49 | weights = torch.cat([weights, exp_weights], axis=0) 50 | 51 | # shuffle tensors 52 | index = torch.randperm(len(rewards)) 53 | if indexes is None or exp_indexes is None: 54 | indexes = None 55 | else: 56 | indexes = torch.cat([indexes, exp_indexes], axis=0)[index] 57 | 58 | return (states[index], 59 | actions[index], 60 | rewards[index], 61 | next_states[index], 62 | weights[index], 63 | indexes) 64 | -------------------------------------------------------------------------------- /rlil/policies/__init__.py: -------------------------------------------------------------------------------- 1 | from .gaussian import GaussianPolicy 2 | from .softmax import SoftmaxPolicy 3 | from .deterministic import DeterministicPolicy 4 | from .bcq_deterministic import BCQDeterministicPolicy 5 | from .soft_deterministic import SoftDeterministicPolicy 6 | 7 | __all__ = [ 8 | "GaussianPolicy", 9 | "SoftmaxPolicy", 10 | "DeterministicPolicy", 11 | "BCQDeterministicPolicy", 12 | "SoftDeterministicPolicy" 13 | ] 14 | -------------------------------------------------------------------------------- /rlil/policies/bcq_deterministic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.environments import squash_action 3 | from rlil.approximation import Approximation 4 | from rlil.nn import RLNetwork 5 | 6 | 7 | class BCQDeterministicPolicy(Approximation): 8 | def __init__( 9 | self, 10 | model, 11 | optimizer, 12 | space, 13 | phi=0.05, 14 | name='policy', 15 | **kwargs 16 | ): 17 | model = BCQDeterministicPolicyNetwork(model, space, phi) 18 | super().__init__( 19 | model, 20 | optimizer, 21 | name=name, 22 | **kwargs 23 | ) 24 | 25 | 26 | class BCQDeterministicPolicyNetwork(RLNetwork): 27 | def __init__(self, model, space, phi=0.05): 28 | super().__init__(model) 29 | self._tanh_scale = torch.tensor( 30 | (space.high - space.low) / 2, 31 | dtype=torch.float32, device=self.device) 32 | self._tanh_mean = torch.tensor( 33 | (space.high + space.low) / 2, 34 | dtype=torch.float32, device=self.device) 35 | self.phi = phi 36 | 37 | def forward(self, states, vae_actions): 38 | x = torch.cat((states.features.float(), 39 | vae_actions.features.float()), dim=1) 40 | actions = self.model(x) * states.mask.float().unsqueeze(-1) 41 | actions = self.phi * \ 42 | squash_action(actions, self._tanh_scale, self._tanh_mean) 43 | return vae_actions.features + actions 44 | 45 | def to(self, device): 46 | self._tanh_mean = self._tanh_mean.to(device) 47 | self._tanh_scale = self._tanh_scale.to(device) 48 | return super().to(device) 49 | -------------------------------------------------------------------------------- /rlil/policies/deterministic.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.environments import squash_action 3 | from rlil.approximation import Approximation 4 | from rlil.nn import RLNetwork 5 | 6 | 7 | class DeterministicPolicy(Approximation): 8 | def __init__( 9 | self, 10 | model, 11 | optimizer, 12 | space, 13 | name='policy', 14 | **kwargs 15 | ): 16 | model = DeterministicPolicyNetwork(model, space) 17 | super().__init__( 18 | model, 19 | optimizer, 20 | name=name, 21 | **kwargs 22 | ) 23 | 24 | 25 | class DeterministicPolicyNetwork(RLNetwork): 26 | def __init__(self, model, space): 27 | super().__init__(model) 28 | self._tanh_scale = torch.tensor( 29 | (space.high - space.low) / 2).to(self.device) 30 | self._tanh_mean = torch.tensor( 31 | (space.high + space.low) / 2).to(self.device) 32 | 33 | def forward(self, state): 34 | return squash_action(super().forward(state), 35 | self._tanh_scale, self._tanh_mean) 36 | 37 | def to(self, device): 38 | self._tanh_mean = self._tanh_mean.to(device) 39 | self._tanh_scale = self._tanh_scale.to(device) 40 | return super().to(device) 41 | -------------------------------------------------------------------------------- /rlil/policies/gaussian.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.distributions.independent import Independent 3 | from torch.distributions.normal import Normal 4 | from rlil.environments import squash_action 5 | from rlil.approximation import Approximation 6 | from rlil.nn import RLNetwork 7 | 8 | 9 | class GaussianPolicy(Approximation): 10 | def __init__( 11 | self, 12 | model, 13 | optimizer, 14 | space, 15 | name='policy', 16 | **kwargs 17 | ): 18 | super().__init__( 19 | GaussianPolicyNetwork(model, space), 20 | optimizer, 21 | name=name, 22 | **kwargs 23 | ) 24 | 25 | 26 | class GaussianPolicyNetwork(RLNetwork): 27 | def __init__(self, model, space): 28 | super().__init__(model) 29 | self._action_dim = space.shape[0] 30 | 31 | def forward(self, state, return_mean=False): 32 | outputs = super().forward(state) 33 | means = outputs[:, :self._action_dim] 34 | 35 | if return_mean: 36 | return means 37 | 38 | logvars = outputs[:, self._action_dim:] 39 | std = logvars.exp_() 40 | return Independent(Normal(means, std), 1) 41 | 42 | def to(self, device): 43 | return super().to(device) 44 | -------------------------------------------------------------------------------- /rlil/policies/soft_deterministic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn.functional as F 4 | from rlil.approximation import Approximation 5 | from rlil.nn import RLNetwork 6 | from rlil.environments import squash_action 7 | 8 | 9 | class SoftDeterministicPolicy(Approximation): 10 | def __init__( 11 | self, 12 | model, 13 | optimizer, 14 | space, 15 | name="policy", 16 | **kwargs 17 | ): 18 | model = SoftDeterministicPolicyNetwork(model, space) 19 | super().__init__(model, optimizer, name=name, **kwargs) 20 | 21 | def sample_multiple(self, states, num_sample=10): 22 | return self.model.sample_multiple(states, num_sample) 23 | 24 | def compute_log_prob(self, raw, normal): 25 | return self.model.compute_log_prob(raw, normal) 26 | 27 | def mean_logvar(self, states): 28 | return self.model.mean_logvar(states) 29 | 30 | 31 | class SoftDeterministicPolicyNetwork(RLNetwork): 32 | def __init__(self, model, space): 33 | super().__init__(model) 34 | self._action_dim = 
space.shape[0] 35 | self._tanh_scale = torch.tensor( 36 | (space.high - space.low) / 2, 37 | dtype=torch.float32, device=self.device) 38 | self._tanh_mean = torch.tensor( 39 | (space.high + space.low) / 2, 40 | dtype=torch.float32, device=self.device) 41 | 42 | def forward(self, state, return_mean=False): 43 | outputs = super().forward(state) 44 | if return_mean: 45 | means = outputs[:, 0: self._action_dim] 46 | means = squash_action(means, self._tanh_scale, self._tanh_mean) 47 | return means 48 | 49 | # make normal distribution 50 | means = outputs[:, 0: self._action_dim] 51 | logvars = outputs[:, self._action_dim:] 52 | std = logvars.mul(0.5).exp_() 53 | normal = torch.distributions.normal.Normal(means, std) 54 | 55 | # sample from the normal distribution 56 | raw = normal.rsample() 57 | log_prob = self.compute_log_prob(raw, normal) 58 | 59 | action = squash_action(raw, self._tanh_scale, self._tanh_mean) 60 | return action, log_prob 61 | 62 | def sample_multiple(self, state, num_sample=10): 63 | # this function is used in BEAR and BRAC training 64 | outputs = super().forward(state) 65 | 66 | # make normal distribution 67 | means = outputs[:, 0: self._action_dim] 68 | repeated_means = torch.repeat_interleave( 69 | means.unsqueeze(1), num_sample, 1) 70 | logvars = outputs[:, self._action_dim:] 71 | repeated_logvars = torch.repeat_interleave( 72 | logvars.unsqueeze(1), num_sample, 1) 73 | repeated_std = repeated_logvars.mul(0.5).exp_() 74 | # batch x num_sample x d 75 | normal = torch.distributions.normal.Normal( 76 | repeated_means, repeated_std) 77 | raw = normal.rsample() 78 | action = squash_action(raw, self._tanh_scale, self._tanh_mean) 79 | return action, raw 80 | 81 | def compute_log_prob(self, raw, normal): 82 | # see openai spinningup for log_prob computation: 83 | # https://github.com/openai/spinningup/blob/e76f3cc1dfbf94fe052a36082dbd724682f0e8fd/spinup/algos/pytorch/sac/core.py#L53 84 | 85 | log_prob = normal.log_prob(raw).sum(axis=-1) 86 | log_prob -= (2*(np.log(2) - raw - F.softplus(-2*raw))).sum(axis=-1) 87 | return log_prob 88 | 89 | def mean_logvar(self, state): 90 | outputs = super().forward(state) 91 | means = outputs[:, 0: self._action_dim] 92 | logvars = outputs[:, self._action_dim:] 93 | return means, logvars 94 | 95 | def to(self, device): 96 | self._tanh_mean = self._tanh_mean.to(device) 97 | self._tanh_scale = self._tanh_scale.to(device) 98 | return super().to(device) 99 | -------------------------------------------------------------------------------- /rlil/policies/softmax.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional 3 | from rlil.nn import RLNetwork 4 | from rlil.approximation import Approximation 5 | 6 | 7 | class SoftmaxPolicy(Approximation): 8 | def __init__( 9 | self, 10 | model, 11 | optimizer, 12 | name='policy', 13 | **kwargs 14 | ): 15 | model = SoftmaxPolicyNetwork(model) 16 | super().__init__(model, optimizer, name=name, **kwargs) 17 | 18 | 19 | class SoftmaxPolicyNetwork(RLNetwork): 20 | def __init__(self, model): 21 | super().__init__(model) 22 | 23 | def forward(self, state): 24 | outputs = super().forward(state) 25 | probs = functional.softmax(outputs, dim=-1) 26 | return torch.distributions.Categorical(probs) 27 | -------------------------------------------------------------------------------- /rlil/presets/__init__.py: -------------------------------------------------------------------------------- 1 | from rlil.presets.validate_agent import 
env_validation, trainer_validation 2 | import inspect 3 | 4 | __all__ = ["env_validation", "trainer_validation"] 5 | 6 | 7 | def get_default_args(func): 8 | signature = inspect.signature(func) 9 | return { 10 | k: v.default 11 | for k, v in signature.parameters.items() 12 | if v.default is not inspect.Parameter.empty 13 | } -------------------------------------------------------------------------------- /rlil/presets/continuous/__init__.py: -------------------------------------------------------------------------------- 1 | # from .actor_critic import actor_critic 2 | from .vac import vac 3 | from .ddpg import ddpg 4 | from .sac import sac 5 | from .td3 import td3 6 | from .noisy_td3 import noisy_td3 7 | from .ppo import ppo 8 | from .bc import bc 9 | from .vae_bc import vae_bc 10 | from .bcq import bcq 11 | from .bear import bear 12 | from .brac import brac 13 | from .gail import gail 14 | from .sqil import sqil 15 | from .airl import airl 16 | from .rs_mpc import rs_mpc 17 | 18 | __all__ = ['vac', 19 | 'ddpg', 20 | 'sac', 21 | 'td3', 22 | 'noisy_td3', 23 | 'ppo', 24 | 'bcq', 25 | 'bear', 26 | 'brac', 27 | 'bc', 28 | 'vae_bc', 29 | 'gail', 30 | 'sqil', 31 | 'airl', 32 | 'rs_mpc'] 33 | -------------------------------------------------------------------------------- /rlil/presets/continuous/airl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import AIRL 4 | from rlil.initializer import get_device, set_replay_buffer, get_replay_buffer 5 | from .models import fc_reward, fc_v 6 | from rlil.approximation import Approximation, Discriminator, VNetwork 7 | from rlil.memory import ExperienceReplayBuffer, AirlWrapper 8 | 9 | 10 | def airl( 11 | transitions=None, 12 | base_agent_fn=None, 13 | # Common settings 14 | discount_factor=0.98, 15 | # Adam optimizer settings 16 | lr_r=2e-4, 17 | lr_v=2e-4, 18 | # Training settings 19 | minibatch_size=512, 20 | update_frequency=1, 21 | # Replay Buffer settings 22 | replay_start_size=5000, 23 | replay_buffer_size=1e6 24 | ): 25 | """ 26 | Adversarial Inverse Reinforcement Learning (AIRL) control preset 27 | 28 | Args: 29 | transitions: 30 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 31 | base_agent_fn (function): 32 | A function generated by a preset of an agent such as sac, td3, ddpg 33 | Currently, the base_agent_fn must be ppo preset. 34 | lr_r (float): Learning rate for the reward function network. 35 | lr_v (float): Learning rate for the value function network. 36 | update_frequency (int): Number of base_agent update per discriminator update. 37 | minibatch_size (int): Number of experiences to sample in each discriminator update. 38 | replay_start_size (int): Number of experiences in replay buffer when training begins. 39 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 
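    Example (an illustrative sketch; the env id and file path are placeholders,
    and the base agent must currently be the ppo preset):

        import pickle
        from rlil.environments import GymEnvironment
        from rlil.presets.continuous import airl, ppo

        with open("path/to/transitions.pkl", "rb") as f:
            transitions = pickle.load(f)  # dict from cpprb get_all_transitions()

        env = GymEnvironment("Pendulum-v0", append_time=True)
        agent_fn = airl(transitions=transitions, base_agent_fn=ppo())
        agent = agent_fn(env)  # wraps the PPO replay buffer with AirlWrapper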
40 | """ 41 | def _airl(env): 42 | device = get_device() 43 | 44 | base_agent = base_agent_fn(env) 45 | 46 | reward_model = fc_reward(env).to(device) 47 | reward_optimizer = Adam(reward_model.parameters(), 48 | lr=lr_r) 49 | reward_fn = Approximation(reward_model, 50 | reward_optimizer, 51 | name='airl_rew') 52 | 53 | value_model = fc_v(env).to(device) 54 | value_optimizer = Adam(value_model.parameters(), 55 | lr=lr_v) 56 | value_fn = VNetwork(value_model, 57 | value_optimizer, 58 | name='airl_v') 59 | 60 | expert_replay_buffer = ExperienceReplayBuffer(1e7, env) 61 | if transitions is not None: 62 | samples = expert_replay_buffer.samples_from_cpprb( 63 | transitions, device="cpu") 64 | expert_replay_buffer.store(samples) 65 | 66 | replay_buffer = get_replay_buffer() 67 | replay_buffer = AirlWrapper(buffer=replay_buffer, 68 | expert_buffer=expert_replay_buffer, 69 | reward_fn=reward_fn, 70 | value_fn=value_fn, 71 | policy=base_agent.policy, 72 | feature_nw=base_agent.feature_nw, 73 | discount_factor=discount_factor) 74 | set_replay_buffer(replay_buffer) 75 | 76 | # replace base_agent's replay_buffer with gail_buffer 77 | base_agent.replay_buffer = replay_buffer 78 | 79 | return AIRL( 80 | base_agent=base_agent, 81 | minibatch_size=minibatch_size, 82 | replay_start_size=replay_start_size, 83 | update_frequency=update_frequency 84 | ) 85 | return _airl 86 | 87 | 88 | __all__ = ["airl"] 89 | -------------------------------------------------------------------------------- /rlil/presets/continuous/bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import BC 4 | from rlil.initializer import (get_device, 5 | set_replay_buffer, 6 | disable_on_policy_mode) 7 | from rlil.policies import DeterministicPolicy 8 | from rlil.memory import ExperienceReplayBuffer 9 | from .models import fc_deterministic_policy 10 | 11 | 12 | def bc( 13 | transitions=None, 14 | # Adam optimizer settings 15 | lr_pi=1e-3, 16 | # Training settings 17 | minibatch_size=100, 18 | ): 19 | """ 20 | Behavioral Cloning (BC) control preset 21 | 22 | Args: 23 | transitions: 24 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 25 | lr_pi (float): Learning rate for the policy network. 26 | minibatch_size (int): Number of experiences to sample in each training update. 
27 | """ 28 | def _bc(env): 29 | disable_on_policy_mode() 30 | device = get_device() 31 | 32 | policy_model = fc_deterministic_policy(env).to(device) 33 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 34 | policy = DeterministicPolicy( 35 | policy_model, 36 | policy_optimizer, 37 | env.action_space, 38 | ) 39 | 40 | replay_buffer = ExperienceReplayBuffer(1e7, env) 41 | if transitions is not None: 42 | samples = replay_buffer.samples_from_cpprb( 43 | transitions, device="cpu") 44 | replay_buffer.store(samples) 45 | set_replay_buffer(replay_buffer) 46 | 47 | return BC( 48 | policy=policy, 49 | minibatch_size=minibatch_size, 50 | ) 51 | return _bc 52 | 53 | 54 | __all__ = ["bc"] 55 | -------------------------------------------------------------------------------- /rlil/presets/continuous/bear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.optim import Adam 4 | from rlil.agents import BEAR 5 | from rlil.approximation import (EnsembleQContinuous, 6 | PolyakTarget, 7 | BcqEncoder, 8 | BcqDecoder) 9 | from rlil.policies import SoftDeterministicPolicy 10 | from rlil.memory import ExperienceReplayBuffer 11 | from rlil.initializer import (get_device, 12 | set_replay_buffer, 13 | disable_on_policy_mode) 14 | from .models import (fc_q, 15 | fc_soft_policy, 16 | fc_bcq_encoder, 17 | fc_bcq_decoder) 18 | 19 | 20 | def bear( 21 | transitions=None, 22 | # Common settings 23 | discount_factor=0.99, 24 | # Adam optimizer settings 25 | lr_q=1e-3, 26 | lr_pi=1e-3, 27 | lr_enc=1e-3, 28 | lr_dec=1e-3, 29 | # Training settings 30 | minibatch_size=100, 31 | polyak_rate=0.005, 32 | # BEAR settings 33 | num_qs=2, 34 | kernel_type="laplacian", 35 | ): 36 | """ 37 | Bootstrapping error accumulation reduction (BEAR) control preset 38 | 39 | Args: 40 | transitions: 41 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 42 | discount_factor (float): Discount factor for future rewards. 43 | lr_q (float): Learning rate for the Q network. 44 | lr_pi (float): Learning rate for the policy network. 45 | lr_enc (float): Learning rate for the encoder. 46 | lr_dec (float): Learning rate for the decoder. 47 | minibatch_size (int): Number of experiences to sample in each training update. 48 | polyak_rate (float): Speed with which to update the target network towards the online network. 49 | num_qs (int): Number of q functions for ensemble. 
50 | """ 51 | def _bear(env): 52 | disable_on_policy_mode() 53 | 54 | device = get_device() 55 | q_models = nn.ModuleList([fc_q(env) for _ in range(num_qs)]).to(device) 56 | qs_optimizer = Adam(q_models.parameters(), lr=lr_q) 57 | qs = EnsembleQContinuous( 58 | q_models, 59 | qs_optimizer, 60 | target=PolyakTarget(polyak_rate), 61 | name='qs' 62 | ) 63 | 64 | policy_model = fc_soft_policy(env).to(device) 65 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 66 | policy = SoftDeterministicPolicy( 67 | policy_model, 68 | policy_optimizer, 69 | env.action_space, 70 | target=PolyakTarget(polyak_rate), 71 | ) 72 | 73 | latent_dim = env.action_space.shape[0] * 2 74 | encoder_model = fc_bcq_encoder(env, latent_dim=latent_dim).to(device) 75 | encoder_optimizer = Adam(encoder_model.parameters(), lr=lr_enc) 76 | encoder = BcqEncoder( 77 | model=encoder_model, 78 | latent_dim=latent_dim, 79 | optimizer=encoder_optimizer, 80 | name="encoder", 81 | ) 82 | decoder_model = fc_bcq_decoder(env, latent_dim=latent_dim).to(device) 83 | decoder_optimizer = Adam(decoder_model.parameters(), lr=lr_dec) 84 | decoder = BcqDecoder( 85 | model=decoder_model, 86 | latent_dim=latent_dim, 87 | space=env.action_space, 88 | optimizer=decoder_optimizer, 89 | name="decoder", 90 | ) 91 | 92 | replay_buffer = ExperienceReplayBuffer(1e7, env) 93 | if transitions is not None: 94 | samples = replay_buffer.samples_from_cpprb( 95 | transitions, device="cpu") 96 | replay_buffer.store(samples) 97 | set_replay_buffer(replay_buffer) 98 | 99 | return BEAR( 100 | qs=qs, 101 | encoder=encoder, 102 | decoder=decoder, 103 | policy=policy, 104 | kernel_type=kernel_type, 105 | discount_factor=discount_factor, 106 | minibatch_size=minibatch_size, 107 | ) 108 | return _bear 109 | 110 | 111 | __all__ = ["bear"] 112 | -------------------------------------------------------------------------------- /rlil/presets/continuous/brac.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.optim import Adam 4 | from rlil.agents import BRAC 5 | from rlil.approximation import (QContinuous, 6 | PolyakTarget, 7 | BcqEncoder, 8 | BcqDecoder) 9 | from rlil.policies import SoftDeterministicPolicy 10 | from rlil.memory import ExperienceReplayBuffer 11 | from rlil.initializer import (get_device, 12 | set_replay_buffer, 13 | disable_on_policy_mode) 14 | from .models import (fc_q, 15 | fc_soft_policy, 16 | fc_bcq_encoder, 17 | fc_bcq_decoder) 18 | 19 | 20 | def brac( 21 | transitions=None, 22 | # Common settings 23 | discount_factor=0.99, 24 | # Adam optimizer settings 25 | lr_q=1e-3, 26 | lr_pi=1e-3, 27 | # Training settings 28 | bc_iters=5000, 29 | minibatch_size=100, 30 | polyak_rate=0.005, 31 | alpha=0.1 32 | ): 33 | """ 34 | Bootstrapping error accumulation reduction (BEAR) control preset 35 | 36 | Args: 37 | transitions: 38 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 39 | discount_factor (float): Discount factor for future rewards. 40 | lr_q (float): Learning rate for the Q network. 41 | lr_pi (float): Learning rate for the policy network. 42 | alpha (float): Value of lagrange multipliers. Trick 3. 43 | minibatch_size (int): Number of experiences to sample in each training update. 44 | polyak_rate (float): Speed with which to update the target network towards the online network. 
45 | """ 46 | def _brac(env): 47 | disable_on_policy_mode() 48 | 49 | device = get_device() 50 | q_1_model = fc_q(env).to(device) 51 | q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q) 52 | q_1 = QContinuous( 53 | q_1_model, 54 | q_1_optimizer, 55 | target=PolyakTarget(polyak_rate), 56 | name='q_1' 57 | ) 58 | 59 | q_2_model = fc_q(env).to(device) 60 | q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q) 61 | q_2 = QContinuous( 62 | q_2_model, 63 | q_2_optimizer, 64 | target=PolyakTarget(polyak_rate), 65 | name='q_2' 66 | ) 67 | 68 | policy_model = fc_soft_policy(env).to(device) 69 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 70 | policy = SoftDeterministicPolicy( 71 | policy_model, 72 | policy_optimizer, 73 | env.action_space, 74 | target=PolyakTarget(polyak_rate), 75 | ) 76 | 77 | behavior_model = fc_soft_policy(env).to(device) 78 | behavior_optimizer = Adam(behavior_model.parameters(), lr=lr_pi) 79 | behavior_policy = SoftDeterministicPolicy( 80 | behavior_model, 81 | behavior_optimizer, 82 | env.action_space, 83 | target=PolyakTarget(polyak_rate), 84 | name='behavior_policy' 85 | ) 86 | 87 | replay_buffer = ExperienceReplayBuffer(1e7, env) 88 | if transitions is not None: 89 | samples = replay_buffer.samples_from_cpprb( 90 | transitions, device="cpu") 91 | replay_buffer.store(samples) 92 | set_replay_buffer(replay_buffer) 93 | 94 | return BRAC( 95 | q_1=q_1, 96 | q_2=q_2, 97 | policy=policy, 98 | behavior_policy=behavior_policy, 99 | bc_iters=bc_iters, 100 | alpha=alpha, 101 | discount_factor=discount_factor, 102 | minibatch_size=minibatch_size, 103 | ) 104 | return _brac 105 | 106 | 107 | __all__ = ["brac"] 108 | -------------------------------------------------------------------------------- /rlil/presets/continuous/ddpg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import DDPG 4 | from rlil.approximation import QContinuous, PolyakTarget 5 | from rlil.policies import DeterministicPolicy 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.initializer import (get_device, 8 | set_replay_buffer, 9 | disable_on_policy_mode, 10 | set_n_step, 11 | enable_apex) 12 | from .models import fc_q, fc_deterministic_policy 13 | 14 | 15 | def ddpg( 16 | # Common settings 17 | discount_factor=0.99, 18 | # Adam optimizer settings 19 | lr_q=1e-3, 20 | lr_pi=1e-3, 21 | # Training settings 22 | minibatch_size=512, 23 | polyak_rate=0.005, 24 | # Replay Buffer settings 25 | replay_start_size=5000, 26 | replay_buffer_size=1e7, 27 | prioritized=False, 28 | use_apex=False, 29 | n_step=1, 30 | # Exploration settings 31 | noise=0.1, 32 | ): 33 | """ 34 | DDPG continuous control preset. 35 | 36 | Args: 37 | discount_factor (float): Discount factor for future rewards. 38 | lr_q (float): Learning rate for the Q network. 39 | lr_pi (float): Learning rate for the policy network. 40 | minibatch_size (int): Number of experiences to sample in each training update. 41 | polyak_rate (float): Speed with which to update the target network towards the online network. 42 | replay_start_size (int): Number of experiences in replay buffer when training begins. 43 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 44 | prioritized (bool): Use prioritized experience replay if True. 45 | use_apex (bool): Use apex if True. 46 | n_step (int): Number of steps for N step experience replay. 47 | noise (float): The amount of exploration noise to add. 
48 | """ 49 | def _ddpg(env): 50 | disable_on_policy_mode() 51 | 52 | device = get_device() 53 | q_model = fc_q(env).to(device) 54 | q_optimizer = Adam(q_model.parameters(), lr=lr_q) 55 | q = QContinuous( 56 | q_model, 57 | q_optimizer, 58 | target=PolyakTarget(polyak_rate), 59 | ) 60 | 61 | policy_model = fc_deterministic_policy(env).to(device) 62 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 63 | policy = DeterministicPolicy( 64 | policy_model, 65 | policy_optimizer, 66 | env.action_space, 67 | target=PolyakTarget(polyak_rate), 68 | ) 69 | 70 | if use_apex: 71 | enable_apex() 72 | set_n_step(n_step=n_step, discount_factor=discount_factor) 73 | replay_buffer = ExperienceReplayBuffer( 74 | replay_buffer_size, env, 75 | prioritized=prioritized or use_apex) 76 | set_replay_buffer(replay_buffer) 77 | 78 | return DDPG( 79 | q, 80 | policy, 81 | noise=noise, 82 | replay_start_size=replay_start_size, 83 | discount_factor=discount_factor, 84 | minibatch_size=minibatch_size, 85 | ) 86 | return _ddpg 87 | 88 | 89 | __all__ = ["ddpg"] 90 | -------------------------------------------------------------------------------- /rlil/presets/continuous/gail.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import GAIL 4 | from rlil.initializer import get_device, set_replay_buffer, get_replay_buffer 5 | from .models import fc_discriminator 6 | from rlil.approximation import Discriminator 7 | from rlil.memory import ExperienceReplayBuffer, GailWrapper 8 | 9 | 10 | def gail( 11 | transitions=None, 12 | base_agent_fn=None, 13 | # Adam optimizer settings 14 | lr_d=2e-4, 15 | # Training settings 16 | minibatch_size=512, 17 | update_frequency=1, 18 | # Replay Buffer settings 19 | replay_start_size=5000, 20 | replay_buffer_size=1e6 21 | ): 22 | """ 23 | Generative Adversarial Imitation Learning (GAIL) control preset 24 | 25 | Args: 26 | transitions: 27 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 28 | base_agent_fn (function): 29 | A function generated by a preset of an agent such as sac, td3, ddpg 30 | lr_d (float): Learning rate for the discriminator network. 31 | update_frequency (int): Number of base_agent update per discriminator update. 32 | minibatch_size (int): Number of experiences to sample in each discriminator update. 33 | replay_start_size (int): Number of experiences in replay buffer when training begins. 34 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 
35 | """ 36 | def _gail(env): 37 | device = get_device() 38 | 39 | base_agent = base_agent_fn(env) 40 | 41 | discriminator_model = fc_discriminator(env).to(device) 42 | discriminator_optimizer = Adam(discriminator_model.parameters(), 43 | lr=lr_d) 44 | discriminator = Discriminator(discriminator_model, 45 | discriminator_optimizer) 46 | 47 | expert_replay_buffer = ExperienceReplayBuffer(1e7, env) 48 | if transitions is not None: 49 | samples = expert_replay_buffer.samples_from_cpprb( 50 | transitions, device="cpu") 51 | expert_replay_buffer.store(samples) 52 | 53 | replay_buffer = get_replay_buffer() 54 | replay_buffer = GailWrapper(replay_buffer, 55 | expert_replay_buffer, 56 | discriminator) 57 | set_replay_buffer(replay_buffer) 58 | 59 | # replace base_agent's replay_buffer with gail_buffer 60 | base_agent.replay_buffer = replay_buffer 61 | 62 | return GAIL( 63 | base_agent=base_agent, 64 | minibatch_size=minibatch_size, 65 | replay_start_size=replay_start_size, 66 | update_frequency=update_frequency 67 | ) 68 | return _gail 69 | 70 | 71 | __all__ = ["gail"] 72 | -------------------------------------------------------------------------------- /rlil/presets/continuous/noisy_td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import NoisyTD3 4 | from rlil.approximation import QContinuous, PolyakTarget 5 | from rlil.policies import DeterministicPolicy 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.initializer import (get_device, 8 | set_replay_buffer, 9 | disable_on_policy_mode, 10 | set_n_step) 11 | from .models import fc_q, fc_deterministic_noisy_policy 12 | 13 | 14 | def noisy_td3( 15 | # Common settings 16 | discount_factor=0.99, 17 | # Adam optimizer settings 18 | lr_q=1e-3, 19 | lr_pi=1e-3, 20 | # Training settings 21 | minibatch_size=512, 22 | polyak_rate=0.005, 23 | noise_td3=0.2, 24 | policy_update_td3=2, 25 | # Replay Buffer settings 26 | replay_start_size=5000, 27 | replay_buffer_size=1e7, 28 | n_step=1 29 | ): 30 | """ 31 | TD3 continuous control preset. 32 | 33 | Args: 34 | discount_factor (float): Discount factor for future rewards. 35 | lr_q (float): Learning rate for the Q network. 36 | lr_pi (float): Learning rate for the policy network. 37 | minibatch_size (int): Number of experiences to sample in each training update. 38 | polyak_rate (float): Speed with which to update the target network towards the online network. 39 | noise_td3 (float): the amount of noise to add to each action in trick three. 40 | policy_update_td3 (int): Number of timesteps per training update the policy in trick two. 41 | replay_start_size (int): Number of experiences in replay buffer when training begins. 42 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 43 | n_step (int): Number of steps for N step experience replay. 
44 | """ 45 | def _noisy_td3(env): 46 | disable_on_policy_mode() 47 | 48 | device = get_device() 49 | q_1_model = fc_q(env).to(device) 50 | q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q) 51 | q_1 = QContinuous( 52 | q_1_model, 53 | q_1_optimizer, 54 | target=PolyakTarget(polyak_rate), 55 | name='q_1' 56 | ) 57 | 58 | q_2_model = fc_q(env).to(device) 59 | q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q) 60 | q_2 = QContinuous( 61 | q_2_model, 62 | q_2_optimizer, 63 | target=PolyakTarget(polyak_rate), 64 | name='q_2' 65 | ) 66 | 67 | policy_model = fc_deterministic_noisy_policy(env).to(device) 68 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 69 | policy = DeterministicPolicy( 70 | policy_model, 71 | policy_optimizer, 72 | env.action_space, 73 | target=PolyakTarget(polyak_rate), 74 | ) 75 | 76 | set_n_step(n_step=n_step, discount_factor=discount_factor) 77 | replay_buffer = ExperienceReplayBuffer(replay_buffer_size, env) 78 | set_replay_buffer(replay_buffer) 79 | 80 | return NoisyTD3( 81 | q_1, 82 | q_2, 83 | policy, 84 | noise_td3=noise_td3, 85 | policy_update_td3=policy_update_td3, 86 | replay_start_size=replay_start_size, 87 | discount_factor=discount_factor, 88 | minibatch_size=minibatch_size 89 | ) 90 | return _noisy_td3 91 | 92 | 93 | __all__ = ["noisy_td3"] 94 | -------------------------------------------------------------------------------- /rlil/presets/continuous/ppo.py: -------------------------------------------------------------------------------- 1 | from torch.optim import Adam 2 | from rlil.agents import PPO 3 | from rlil.approximation import VNetwork, FeatureNetwork, Approximation 4 | from rlil.policies import GaussianPolicy 5 | from rlil.memory import ExperienceReplayBuffer, GaeWrapper 6 | from rlil.initializer import (get_writer, 7 | get_device, 8 | set_replay_buffer, 9 | enable_on_policy_mode) 10 | from .models import fc_actor_critic 11 | 12 | 13 | def ppo( 14 | # Common settings 15 | discount_factor=0.98, 16 | # Adam optimizer settings 17 | lr=3e-4, # Adam learning rate 18 | eps=1e-5, # Adam stability 19 | # Loss scaling 20 | entropy_loss_scaling=0.0, 21 | value_loss_scaling=0.5, 22 | # Replay Buffer settings 23 | replay_start_size=5000, 24 | # Training settings 25 | clip_grad=0.5, 26 | epsilon=0.2, 27 | minibatches=4, 28 | epochs=2, 29 | # GAE settings 30 | lam=0.95, 31 | ): 32 | """ 33 | PPO continuous control preset. 34 | 35 | Args: 36 | discount_factor (float): Discount factor for future rewards. 37 | lr (float): Learning rate for the Adam optimizer. 38 | eps (float): Stability parameters for the Adam optimizer. 39 | entropy_loss_scaling (float): 40 | Coefficient for the entropy term in the total loss. 41 | value_loss_scaling (float): Coefficient for the value function loss. 42 | replay_start_size (int): Number of experiences in replay buffer when training begins. 43 | clip_grad (float): 44 | The maximum magnitude of the gradient for any given parameter. 45 | Set to 0 to disable. 46 | epsilon (float): 47 | Epsilon value in the clipped PPO objective function. 48 | minibatches (int): The number of minibatches to split each batch into. 49 | lam (float): The Generalized Advantage Estimate (GAE) decay parameter. 
50 | """ 51 | def _ppo(env): 52 | enable_on_policy_mode() 53 | 54 | device = get_device() 55 | feature_model, value_model, policy_model = fc_actor_critic(env) 56 | feature_model.to(device) 57 | value_model.to(device) 58 | policy_model.to(device) 59 | 60 | feature_optimizer = Adam( 61 | feature_model.parameters(), lr=lr, eps=eps 62 | ) 63 | value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps) 64 | policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps) 65 | 66 | feature_nw = FeatureNetwork( 67 | feature_model, 68 | feature_optimizer, 69 | clip_grad=clip_grad, 70 | ) 71 | v = VNetwork( 72 | value_model, 73 | value_optimizer, 74 | loss_scaling=value_loss_scaling, 75 | clip_grad=clip_grad, 76 | ) 77 | policy = GaussianPolicy( 78 | policy_model, 79 | policy_optimizer, 80 | env.action_space, 81 | clip_grad=clip_grad, 82 | ) 83 | 84 | replay_buffer = ExperienceReplayBuffer(1e7, env) 85 | replay_buffer = GaeWrapper(replay_buffer, discount_factor, lam) 86 | set_replay_buffer(replay_buffer) 87 | 88 | return PPO( 89 | feature_nw, 90 | v, 91 | policy, 92 | epsilon=epsilon, 93 | replay_start_size=replay_start_size, 94 | minibatches=minibatches, 95 | entropy_loss_scaling=entropy_loss_scaling, 96 | ) 97 | 98 | return _ppo 99 | 100 | 101 | __all__ = ["ppo"] 102 | -------------------------------------------------------------------------------- /rlil/presets/continuous/rs_mpc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import RsMPC 4 | from rlil.initializer import (get_device, 5 | set_replay_buffer, 6 | disable_on_policy_mode) 7 | from rlil.approximation import Dynamics 8 | from rlil.memory import ExperienceReplayBuffer 9 | from rlil.environments import REWARDS 10 | from .models import fc_dynamics 11 | 12 | 13 | def rs_mpc( 14 | horizon=20, 15 | num_samples=1000, 16 | # Adam optimizer settings 17 | lr_dyn=1e-3, 18 | # Training settings 19 | minibatch_size=100, 20 | # Replay Buffer settings 21 | replay_start_size=5000, 22 | replay_buffer_size=1e7, 23 | ): 24 | """ 25 | Rnadom shooting MPC (RsMPC) control preset 26 | 27 | Args: 28 | horizon (int): Control horizon. 29 | num_samples (int): Number of action samples for random shooting. 30 | lr_dyn (float): Learning rate for the dynamics network. 31 | minibatch_size (int): Number of experiences to sample in each training update. 32 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 33 | """ 34 | def _rs_mpc(env): 35 | assert env.name in REWARDS, \ 36 | "The reward function of {} is not registered in rlil.environments.reward_fns." 
37 | reward_fn = REWARDS[env.name]() 38 | 39 | disable_on_policy_mode() 40 | device = get_device() 41 | 42 | dynamics_model = fc_dynamics(env).to(device) 43 | dynamics_optimizer = Adam(dynamics_model.parameters(), lr=lr_dyn) 44 | dynamics = Dynamics( 45 | dynamics_model, 46 | dynamics_optimizer, 47 | ) 48 | 49 | replay_buffer = ExperienceReplayBuffer(replay_buffer_size, env) 50 | set_replay_buffer(replay_buffer) 51 | 52 | return RsMPC( 53 | dynamics=dynamics, 54 | reward_fn=reward_fn, 55 | horizon=horizon, 56 | num_samples=num_samples, 57 | minibatch_size=minibatch_size, 58 | replay_start_size=replay_start_size 59 | ) 60 | return _rs_mpc 61 | 62 | 63 | __all__ = ["rs_mpc"] 64 | -------------------------------------------------------------------------------- /rlil/presets/continuous/sqil.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.initializer import set_replay_buffer, get_replay_buffer 3 | from rlil.memory import ExperienceReplayBuffer, SqilWrapper 4 | 5 | 6 | def sqil( 7 | transitions=None, 8 | base_agent_fn=None, 9 | # Replay Buffer settings 10 | replay_start_size=5000, 11 | replay_buffer_size=1e7 12 | ): 13 | """ 14 | Soft Q Imitation Learning (SQIL) control preset 15 | 16 | Args: 17 | transitions: 18 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 19 | base_agent_fn (function): 20 | A function generated by a preset of an agent such as sac, td3, ddpg 21 | replay_start_size (int): Number of experiences in replay buffer when training begins. 22 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 23 | """ 24 | def _sqil(env): 25 | base_agent = base_agent_fn(env) 26 | expert_replay_buffer = ExperienceReplayBuffer(1e7, env) 27 | if transitions is not None: 28 | samples = expert_replay_buffer.samples_from_cpprb( 29 | transitions, device="cpu") 30 | expert_replay_buffer.store(samples) 31 | 32 | replay_buffer = get_replay_buffer() 33 | replay_buffer = SqilWrapper(replay_buffer, 34 | expert_replay_buffer) 35 | set_replay_buffer(replay_buffer) 36 | # replace base_agent's replay_buffer with gail_buffer 37 | base_agent.replay_buffer = replay_buffer 38 | 39 | return base_agent 40 | return _sqil 41 | 42 | 43 | __all__ = ["gail"] 44 | -------------------------------------------------------------------------------- /rlil/presets/continuous/td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import TD3 4 | from rlil.approximation import QContinuous, PolyakTarget 5 | from rlil.policies import DeterministicPolicy 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.initializer import (get_device, 8 | set_replay_buffer, 9 | disable_on_policy_mode, 10 | set_n_step, 11 | enable_apex) 12 | from .models import fc_q, fc_deterministic_policy 13 | 14 | 15 | def td3( 16 | # Common settings 17 | discount_factor=0.99, 18 | # Adam optimizer settings 19 | lr_q=1e-3, 20 | lr_pi=1e-3, 21 | # Training settings 22 | minibatch_size=512, 23 | polyak_rate=0.005, 24 | noise_td3=0.2, 25 | policy_update_td3=2, 26 | # Replay Buffer settings 27 | replay_start_size=5000, 28 | replay_buffer_size=1e7, 29 | prioritized=False, 30 | use_apex=False, 31 | n_step=1, 32 | # Exploration settings 33 | noise_policy=0.1, 34 | ): 35 | """ 36 | TD3 continuous control preset. 37 | 38 | Args: 39 | discount_factor (float): Discount factor for future rewards. 
40 | lr_q (float): Learning rate for the Q network. 41 | lr_pi (float): Learning rate for the policy network. 42 | minibatch_size (int): Number of experiences to sample in each training update. 43 | polyak_rate (float): Speed with which to update the target network towards the online network. 44 | noise_td3 (float): the amount of noise to add to each action in trick three. 45 | policy_update_td3 (int): Number of timesteps per training update the policy in trick two. 46 | replay_start_size (int): Number of experiences in replay buffer when training begins. 47 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 48 | prioritized (bool): Use prioritized experience replay if True. 49 | use_apex (bool): Use apex if True. 50 | n_step (int): Number of steps for N step experience replay. 51 | noise_policy (float): The amount of exploration noise to add. 52 | """ 53 | def _td3(env): 54 | disable_on_policy_mode() 55 | 56 | device = get_device() 57 | q_1_model = fc_q(env).to(device) 58 | q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q) 59 | q_1 = QContinuous( 60 | q_1_model, 61 | q_1_optimizer, 62 | target=PolyakTarget(polyak_rate), 63 | name='q_1' 64 | ) 65 | 66 | q_2_model = fc_q(env).to(device) 67 | q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q) 68 | q_2 = QContinuous( 69 | q_2_model, 70 | q_2_optimizer, 71 | target=PolyakTarget(polyak_rate), 72 | name='q_2' 73 | ) 74 | 75 | policy_model = fc_deterministic_policy(env).to(device) 76 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 77 | policy = DeterministicPolicy( 78 | policy_model, 79 | policy_optimizer, 80 | env.action_space, 81 | target=PolyakTarget(polyak_rate), 82 | ) 83 | 84 | if use_apex: 85 | enable_apex() 86 | set_n_step(n_step=n_step, discount_factor=discount_factor) 87 | replay_buffer = ExperienceReplayBuffer( 88 | replay_buffer_size, env, 89 | prioritized=prioritized or use_apex) 90 | set_replay_buffer(replay_buffer) 91 | 92 | return TD3( 93 | q_1, 94 | q_2, 95 | policy, 96 | noise_policy=noise_policy, 97 | noise_td3=noise_td3, 98 | policy_update_td3=policy_update_td3, 99 | replay_start_size=replay_start_size, 100 | discount_factor=discount_factor, 101 | minibatch_size=minibatch_size 102 | ) 103 | return _td3 104 | 105 | 106 | __all__ = ["td3"] 107 | -------------------------------------------------------------------------------- /rlil/presets/continuous/vac.py: -------------------------------------------------------------------------------- 1 | from torch.optim import Adam 2 | from rlil.agents import VAC 3 | from rlil.approximation import VNetwork, FeatureNetwork, Approximation 4 | from rlil.policies import GaussianPolicy 5 | from rlil.memory import ExperienceReplayBuffer 6 | from rlil.initializer import (get_writer, 7 | get_device, 8 | set_replay_buffer, 9 | enable_on_policy_mode) 10 | from .models import fc_actor_critic 11 | 12 | 13 | def vac( 14 | # Common settings 15 | discount_factor=0.98, 16 | # Adam optimizer settings 17 | lr=3e-4, # Adam learning rate 18 | eps=1e-5, # Adam stability 19 | # Loss scaling 20 | value_loss_scaling=0.5, 21 | # Replay Buffer settings 22 | replay_start_size=500, 23 | # Training settings 24 | clip_grad=0.5, 25 | ): 26 | """ 27 | VAC continuous control preset. 28 | 29 | Args: 30 | discount_factor (float): Discount factor for future rewards. 31 | lr (float): Learning rate for the Adam optimizer. 32 | eps (float): Stability parameters for the Adam optimizer. 
33 | entropy_loss_scaling (float): 34 | Coefficient for the entropy term in the total loss. 35 | value_loss_scaling (float): Coefficient for the value function loss. 36 | replay_start_size (int): Number of experiences in replay buffer when training begins. 37 | clip_grad (float): 38 | The maximum magnitude of the gradient for any given parameter. 39 | Set to 0 to disable. 40 | """ 41 | def _vac(env): 42 | enable_on_policy_mode() 43 | 44 | device = get_device() 45 | feature_model, value_model, policy_model = fc_actor_critic(env) 46 | feature_model.to(device) 47 | value_model.to(device) 48 | policy_model.to(device) 49 | 50 | feature_optimizer = Adam( 51 | feature_model.parameters(), lr=lr, eps=eps 52 | ) 53 | value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps) 54 | policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps) 55 | 56 | feature_nw = FeatureNetwork( 57 | feature_model, 58 | feature_optimizer, 59 | clip_grad=clip_grad, 60 | ) 61 | v = VNetwork( 62 | value_model, 63 | value_optimizer, 64 | loss_scaling=value_loss_scaling, 65 | clip_grad=clip_grad, 66 | ) 67 | policy = GaussianPolicy( 68 | policy_model, 69 | policy_optimizer, 70 | env.action_space, 71 | clip_grad=clip_grad, 72 | ) 73 | 74 | replay_buffer = ExperienceReplayBuffer(1e7, env) 75 | set_replay_buffer(replay_buffer) 76 | 77 | return VAC( 78 | feature_nw, 79 | v, 80 | policy, 81 | discount_factor=discount_factor, 82 | replay_start_size=replay_start_size, 83 | ) 84 | 85 | return _vac 86 | 87 | 88 | __all__ = ["vac"] 89 | -------------------------------------------------------------------------------- /rlil/presets/continuous/vae_bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import VaeBC 4 | from rlil.approximation import (BcqEncoder, 5 | BcqDecoder) 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.initializer import (get_device, 8 | set_replay_buffer, 9 | disable_on_policy_mode) 10 | from .models import (fc_bcq_encoder, 11 | fc_bcq_decoder) 12 | 13 | 14 | def vae_bc( 15 | transitions=None, 16 | # Adam optimizer settings 17 | lr_enc=1e-3, 18 | lr_dec=1e-3, 19 | # Training settings 20 | minibatch_size=100, 21 | ): 22 | """ 23 | VAE Behavioral Cloning (VAE-BC) control preset 24 | 25 | Args: 26 | transitions: 27 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 28 | lr_enc (float): Learning rate for the encoder. 29 | lr_dec (float): Learning rate for the decoder. 30 | minibatch_size (int): Number of experiences to sample in each training update. 
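    Example (an illustrative sketch; transitions is assumed to be a cpprb
    transition dict loaded as in scripts/continuous/offline.py, and the env id
    is a placeholder):

        from rlil.environments import GymEnvironment
        from rlil.presets.continuous import vae_bc

        env = GymEnvironment("Pendulum-v0", append_time=True)
        agent = vae_bc(transitions=transitions, lr_enc=1e-3, lr_dec=1e-3)(env)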
31 | """ 32 | def _vae_bc(env): 33 | disable_on_policy_mode() 34 | device = get_device() 35 | 36 | latent_dim = env.action_space.shape[0] * 2 37 | 38 | encoder_model = fc_bcq_encoder(env, latent_dim=latent_dim).to(device) 39 | encoder_optimizer = Adam(encoder_model.parameters(), lr=lr_enc) 40 | encoder = BcqEncoder( 41 | model=encoder_model, 42 | latent_dim=latent_dim, 43 | optimizer=encoder_optimizer, 44 | name="encoder", 45 | ) 46 | decoder_model = fc_bcq_decoder(env, latent_dim=latent_dim).to(device) 47 | decoder_optimizer = Adam(decoder_model.parameters(), lr=lr_dec) 48 | decoder = BcqDecoder( 49 | model=decoder_model, 50 | latent_dim=latent_dim, 51 | space=env.action_space, 52 | optimizer=decoder_optimizer, 53 | name="decoder", 54 | ) 55 | 56 | replay_buffer = ExperienceReplayBuffer(1e7, env) 57 | if transitions is not None: 58 | samples = replay_buffer.samples_from_cpprb( 59 | transitions, device="cpu") 60 | replay_buffer.store(samples) 61 | set_replay_buffer(replay_buffer) 62 | 63 | return VaeBC( 64 | encoder=encoder, 65 | decoder=decoder, 66 | minibatch_size=minibatch_size, 67 | ) 68 | return _vae_bc 69 | 70 | 71 | __all__ = ["vae_bc"] 72 | -------------------------------------------------------------------------------- /rlil/presets/validate_agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import ray 4 | from rlil.environments import State 5 | from rlil.initializer import is_on_policy_mode 6 | from rlil.samplers import AsyncSampler 7 | from rlil.experiments import Trainer 8 | 9 | 10 | def env_validation(agent_fn, env, done_step=-1): 11 | """ 12 | Args: 13 | agent_fn (func): presets of the agent 14 | env (rlil.GymEnvironment) 15 | done_step (optional): 16 | Run until the step reaches done_step. 17 | If less than 0, run until env.done == True. 
18 | """ 19 | 20 | agent = agent_fn(env) 21 | num_trains = 0 22 | 23 | for _ in range(2): 24 | env.reset() 25 | done_flag = False 26 | step = 0 27 | while not done_flag: 28 | num_trains += agent.should_train() 29 | if not is_on_policy_mode(): 30 | agent.train() 31 | env.step(agent.act(env.state, env.reward)) 32 | step += 1 33 | if done_step < 0: 34 | done_flag = env.done 35 | else: 36 | done_flag = done_step < step 37 | num_trains += agent.should_train() 38 | agent.train() 39 | agent.act(env.state, env.reward) 40 | 41 | assert num_trains > 0 42 | 43 | 44 | def trainer_validation(agent_fn, env, apex=False): 45 | agent = agent_fn(env) 46 | lazy_agent = agent.make_lazy_agent() 47 | eval_lazy_agent = agent.make_lazy_agent(evaluation=True) 48 | lazy_agent.set_replay_buffer(env) 49 | eval_lazy_agent.set_replay_buffer(env) 50 | 51 | env.reset() 52 | action = lazy_agent.act(env.state, env.reward) 53 | 54 | while not env.done: 55 | env.step(action) 56 | action = lazy_agent.act(env.state, env.reward) 57 | _ = eval_lazy_agent.act(env.state, env.reward) 58 | 59 | lazy_agent.replay_buffer.on_episode_end() 60 | 61 | samples = lazy_agent.replay_buffer.get_all_transitions() 62 | samples.weights = lazy_agent.compute_priorities(samples) 63 | if apex: 64 | assert samples.weights is not None 65 | agent.replay_buffer.store(samples) 66 | agent.train() 67 | agent.train() 68 | assert agent.writer.train_steps > 1 69 | -------------------------------------------------------------------------------- /rlil/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from rlil.samplers.base import Sampler 2 | from rlil.samplers.asyncsampler import AsyncSampler, StartInfo 3 | 4 | __all__ = ["Sampler", "AsyncSampler", "StartInfo"] 5 | -------------------------------------------------------------------------------- /rlil/samplers/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class Sampler(ABC): 5 | """ 6 | Abstract sampler class 7 | """ 8 | 9 | @abstractmethod 10 | def start_sampling(self, agent, worker_frames, worker_episodes): 11 | """ 12 | Start sampling until it reaches worker_frames or worker_episodes. 13 | 14 | Args: 15 | agent (rlil.agent): Agent to collect samples 16 | worker_frames (int): worker stops to sample when it collects worker_frames 17 | worker_episodes (int): worker stops to sample when it reaches worker_episodes 18 | """ 19 | 20 | @abstractmethod 21 | def store_samples(self): 22 | """ 23 | Store collected samples to the replay_buffer 24 | 25 | Returns: 26 | result (dict): Information of sampling (e.g. 
stored frames, returns, etc) 27 | """ 28 | -------------------------------------------------------------------------------- /rlil/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class Samples: 2 | def __init__(self, states=None, actions=None, rewards=None, 3 | next_states=None, weights=None, indexes=None): 4 | self.states = states 5 | self.actions = actions 6 | self.rewards = rewards 7 | self.next_states = next_states 8 | self.weights = weights 9 | self.indexes = indexes 10 | self._keys = [self.states, self.actions, self.rewards, 11 | self.next_states, self.weights, self.indexes] 12 | 13 | def __iter__(self): 14 | return iter(self._keys) 15 | 16 | 17 | def samples_to_np(samples): 18 | np_states, np_dones = samples.states.raw_numpy() 19 | np_actions = samples.actions.raw_numpy() 20 | np_rewards = samples.rewards.detach().cpu().numpy() 21 | np_next_states, np_next_dones = samples.next_states.raw_numpy() 22 | return np_states, np_rewards, np_actions, np_next_states, \ 23 | np_dones, np_next_dones 24 | -------------------------------------------------------------------------------- /runs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/continuous/offline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet 3 | import pybullet_envs 4 | from rlil.environments import GymEnvironment, ENVS 5 | from rlil.experiments import Experiment 6 | from rlil.presets import get_default_args 7 | from rlil.presets import continuous 8 | from rlil.initializer import get_logger, set_device, set_seed, get_writer 9 | import torch 10 | import logging 11 | import ray 12 | import pickle 13 | import os 14 | import shutil 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser( 19 | description="Run an offline continuous actions benchmark.") 20 | parser.add_argument("env", help="Name of the env") 21 | parser.add_argument("agent", 22 | help="Name of the agent (e.g. bc). See presets for available agents.") 23 | parser.add_argument("dir", 24 | help="Directory where the transitions.pkl is saved.") 25 | parser.add_argument("--device", default="cuda", 26 | help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)") 27 | parser.add_argument("--seed", type=int, default=0, 28 | help="Random seed") 29 | parser.add_argument("--train_minutes", type=int, default=30, 30 | help="Minutes to train.") 31 | parser.add_argument("--num_workers_eval", type=int, 32 | default=1, help="Number of workers for evaluation") 33 | parser.add_argument("--exp_info", default="default experiment", 34 | help="One line descriptions of the experiment. 
\ 35 | Experiments' results are saved in 'runs/[exp_info]/[env_id]/'") 36 | 37 | args = parser.parse_args() 38 | 39 | # initialization 40 | ray.init(include_webui=False, ignore_reinit_error=True) 41 | set_device(torch.device(args.device)) 42 | set_seed(args.seed) 43 | logger = get_logger() 44 | logger.setLevel(logging.DEBUG) 45 | 46 | # set environment 47 | if args.env in ENVS: 48 | env_id = ENVS[args.env] 49 | else: 50 | env_id = args.env 51 | env = GymEnvironment(env_id, append_time=True) 52 | 53 | # set agent 54 | agent_name = args.agent 55 | preset = getattr(continuous, agent_name) 56 | with open(os.path.join(args.dir, "transitions.pkl"), mode='rb') as f: 57 | transitions = pickle.load(f) 58 | agent_fn = preset(transitions) 59 | 60 | # set args_dict 61 | args_dict = get_default_args(preset) 62 | args_dict.update(vars(args)) 63 | 64 | Experiment( 65 | agent_fn, env, 66 | num_workers=0, 67 | num_workers_eval=args.num_workers_eval, 68 | train_minutes=args.train_minutes, 69 | args_dict=args_dict, 70 | seed=args.seed, 71 | exp_info=args.exp_info, 72 | ) 73 | 74 | # copy demo_return.json if exists 75 | demo_return_path = os.path.join(args.dir, "demo_return.json") 76 | if os.path.exists(demo_return_path): 77 | writer = get_writer() 78 | shutil.copy2(demo_return_path, writer.log_dir) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /scripts/continuous/online.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet 3 | import pybullet_envs 4 | from rlil.environments import GymEnvironment, ENVS 5 | from rlil.experiments import Experiment 6 | from rlil.presets import get_default_args, continuous 7 | from rlil.initializer import get_logger, set_device, set_seed 8 | import torch 9 | import logging 10 | import ray 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser( 15 | description="Run a continuous actions benchmark.") 16 | parser.add_argument("env", help="Name of the env") 17 | parser.add_argument("agent", 18 | help="Name of the agent (e.g. ppo). See presets for available agents.") 19 | parser.add_argument("--device", default="cuda", 20 | help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)") 21 | parser.add_argument("--seed", type=int, default=0, 22 | help="Random seed") 23 | parser.add_argument("--train_minutes", type=int, default=60, 24 | help="Minutes to train.") 25 | parser.add_argument("--num_workers", type=int, default=1, 26 | help="Number of workers for training") 27 | parser.add_argument("--exp_info", default="default experiment", 28 | help="One line descriptions of the experiment. 
\ 29 | Experiments' results are saved in 'runs/[exp_info]/[env_id]/'") 30 | 31 | args = parser.parse_args() 32 | 33 | # initialization 34 | ray.init(include_webui=False, ignore_reinit_error=True) 35 | set_device(torch.device(args.device)) 36 | set_seed(args.seed) 37 | logger = get_logger() 38 | logger.setLevel(logging.DEBUG) 39 | 40 | # set environment 41 | if args.env in ENVS: 42 | env_id = ENVS[args.env] 43 | else: 44 | env_id = args.env 45 | env = GymEnvironment(env_id, append_time=True) 46 | 47 | # set agent 48 | agent_name = args.agent 49 | preset = getattr(continuous, agent_name) 50 | buffer_args = {"n_step": 1, "prioritized": False, "use_apex": False} 51 | agent_fn = preset(**buffer_args) 52 | 53 | # set args_dict 54 | args_dict = get_default_args(preset) 55 | args_dict.update(vars(args)) 56 | args_dict.update(buffer_args) 57 | 58 | Experiment( 59 | agent_fn, env, 60 | num_workers=args.num_workers, 61 | train_minutes=args.train_minutes, 62 | args_dict=args_dict, 63 | seed=args.seed, 64 | exp_info=args.exp_info, 65 | ) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /scripts/continuous/online_il.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet 3 | import pybullet_envs 4 | from rlil.environments import GymEnvironment, ENVS 5 | from rlil.experiments import Experiment 6 | from rlil.presets import get_default_args 7 | from rlil.presets import continuous 8 | from rlil.initializer import get_logger, set_device, set_seed, get_writer 9 | import torch 10 | import logging 11 | import ray 12 | import pickle 13 | import os 14 | import shutil 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser( 19 | description="Run an online_il benchmark.") 20 | parser.add_argument("env", help="Name of the env") 21 | parser.add_argument("agent", 22 | help="Name of the online imitation learning agent \ 23 | (e.g. gail). See presets for available agents.") 24 | parser.add_argument("base_agent", 25 | help="Name of the base agent (e.g. ddpg). \ 26 | See presets for available agents.") 27 | parser.add_argument("dir", 28 | help="Directory where the transitions.pkl is saved.") 29 | parser.add_argument("--device", default="cuda", 30 | help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)") 31 | parser.add_argument("--seed", type=int, default=0, 32 | help="Random seed") 33 | parser.add_argument("--train_minutes", type=int, default=60, 34 | help="Minutes to train.") 35 | parser.add_argument("--trains_per_episode", type=int, default=5, 36 | help="Number of training steps per episode") 37 | parser.add_argument("--num_workers", type=int, 38 | default=1, help="Number of workers for training") 39 | parser.add_argument("--exp_info", default="default experiment", 40 | help="One line descriptions of the experiment. 
\ 41 | Experiments' results are saved in 'runs/[exp_info]/[env_id]/'") 42 | 43 | args = parser.parse_args() 44 | 45 | # initialization 46 | ray.init(include_webui=False, ignore_reinit_error=True) 47 | set_device(torch.device(args.device)) 48 | set_seed(args.seed) 49 | logger = get_logger() 50 | logger.setLevel(logging.DEBUG) 51 | 52 | # set environment 53 | if args.env in ENVS: 54 | env_id = ENVS[args.env] 55 | else: 56 | env_id = args.env 57 | env = GymEnvironment(env_id, append_time=True) 58 | 59 | # set base_agent 60 | base_preset = getattr(continuous, args.base_agent) 61 | base_agent_fn = base_preset() 62 | 63 | # set agent 64 | with open(os.path.join(args.dir, "transitions.pkl"), mode='rb') as f: 65 | transitions = pickle.load(f) 66 | preset = getattr(continuous, args.agent) 67 | agent_fn = preset( 68 | transitions=transitions, 69 | base_agent_fn=base_agent_fn, 70 | ) 71 | 72 | agent_name = agent_fn.__name__[1:] 73 | base_agent_name = base_agent_fn.__name__[1:] 74 | 75 | # set args_dict 76 | args_dict = {"args": {}, base_agent_name: {}, agent_name: {}} 77 | args_dict["args"] = vars(args) 78 | args_dict[base_agent_name] = get_default_args(base_preset) 79 | args_dict[agent_name] = get_default_args(preset) 80 | 81 | Experiment( 82 | agent_fn, env, 83 | agent_name=agent_name + "-" + base_agent_name, 84 | num_workers=args.num_workers, 85 | train_minutes=args.train_minutes, 86 | trains_per_episode=args.trains_per_episode, 87 | args_dict=args_dict, 88 | seed=args.seed, 89 | exp_info=args.exp_info, 90 | ) 91 | 92 | # copy demo_return.json if exists 93 | demo_return_path = os.path.join(args.dir, "demo_return.json") 94 | if os.path.exists(demo_return_path): 95 | writer = get_writer() 96 | shutil.copy2(demo_return_path, writer.log_dir) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /scripts/continuous/watch_continuous.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet 3 | import pybullet_envs 4 | import re 5 | import os 6 | import time 7 | from rlil.environments import GymEnvironment, ENVS 8 | from rlil.presets import continuous 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser(description="Watch a continuous agent.") 13 | parser.add_argument("env", help="Name of the env") 14 | parser.add_argument("agent", 15 | help="Name of the agent (e.g. ppo). See presets for available agents.") 16 | parser.add_argument("--train", action="store_true", 17 | help="The model of lazy_agent: evaluation or training.") 18 | parser.add_argument( 19 | "dir", help="Directory where the agent's model was saved.") 20 | parser.add_argument( 21 | "--device", 22 | default="cpu", 23 | help="The name of the device to run the agent on (e.g. 
cpu, cuda, cuda:0)", 24 | ) 25 | parser.add_argument( 26 | "--fps", 27 | default=120, 28 | help="Playback speed", 29 | ) 30 | args = parser.parse_args() 31 | 32 | # load env 33 | env = GymEnvironment(ENVS[args.env], append_time=True) 34 | 35 | # load agent 36 | agent_fn = getattr(continuous, args.agent)() 37 | agent = agent_fn(env) 38 | agent.load(args.dir) 39 | 40 | # watch 41 | watch(agent, env, fps=args.fps, eval=not args.train) 42 | 43 | 44 | def watch(agent, env, fps=60, eval=True): 45 | action = None 46 | returns = 0 47 | # have to call this before initial reset for pybullet envs 48 | if "Bullet" in env.name: 49 | env.render(mode="human") 50 | while True: 51 | time.sleep(1 / fps) 52 | if env.done: 53 | lazy_agent = agent.make_lazy_agent(evaluation=eval) 54 | lazy_agent.set_replay_buffer(env) 55 | print('returns: {}'.format(returns)) 56 | env.reset() 57 | returns = 0 58 | else: 59 | env.step(action) 60 | env.render() 61 | action = lazy_agent.act(env.state, env.reward) 62 | returns += env.reward 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /scripts/offline_continuous.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export exp_info=offline 4 | export train_minutes=120 5 | 6 | for agent in bc bcq 7 | do 8 | for seed in {0..3} 9 | do 10 | # ant 11 | tsp python ~/pytorch-rl-il/scripts/continuous/offline.py ant $agent runs/demos/AntBulletEnv-v0/td3_3000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 12 | # hopper 13 | tsp python ~/pytorch-rl-il/scripts/continuous/offline.py hopper $agent runs/demos/HopperBulletEnv-v0/sac_2000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 14 | # humanoid 15 | tsp python ~/pytorch-rl-il/scripts/continuous/offline.py humanoid $agent runs/demos/HumanoidBulletEnv-v0/td3_1700 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 16 | # walker 17 | tsp python ~/pytorch-rl-il/scripts/continuous/offline.py walker $agent runs/demos/WalkerBulletEnv-v0/ppo_3000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 18 | done 19 | done -------------------------------------------------------------------------------- /scripts/online_continuous.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export exp_info=online 4 | export train_minutes=120 5 | export num_workers=8 6 | 7 | for env in ant humanoid walker lander 8 | do 9 | for agent in ppo ddpg td3 sac 10 | do 11 | tsp python ~/pytorch-rl-il/scripts/continuous/online.py $env $agent --train_minutes $train_minutes --num_workers $num_workers --exp_info $exp_info --device cuda:0 --seed 0 12 | tsp python ~/pytorch-rl-il/scripts/continuous/online.py $env $agent --train_minutes $train_minutes --num_workers $num_workers --exp_info $exp_info --device cuda:0 --seed 1 13 | tsp python ~/pytorch-rl-il/scripts/continuous/online.py $env $agent --train_minutes $train_minutes --num_workers $num_workers --exp_info $exp_info --device cuda:1 --seed 2 14 | tsp python ~/pytorch-rl-il/scripts/continuous/online.py $env $agent --train_minutes $train_minutes --num_workers $num_workers --exp_info $exp_info --device cuda:1 --seed 3 15 | done 16 | done 17 | -------------------------------------------------------------------------------- /scripts/online_il_continuous.bash: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export exp_info=online_il 4 | export train_minutes=120 5 | export trains_per_episode=5 6 | export num_workers=8 7 | 8 | for agent in gail sqil 9 | do 10 | for base_agent in ppo sac 11 | do 12 | for seed in {0..3} 13 | do 14 | # ant 15 | tsp python ~/pytorch-rl-il/scripts/continuous/online_il.py ant $agent $base_agent runs/demos/AntBulletEnv-v0/td3_3000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 16 | # hopper 17 | tsp python ~/pytorch-rl-il/scripts/continuous/online_il.py hopper $agent $base_agent runs/demos/HopperBulletEnv-v0/sac_2000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 18 | # humanoid 19 | tsp python ~/pytorch-rl-il/scripts/continuous/online_il.py humanoid $agent $base_agent runs/demos/HumanoidBulletEnv-v0/td3_1700 --train_minutes $train_minutes --exp_info $exp_info --device cuda:1 --seed $seed 20 | # walker 21 | tsp python ~/pytorch-rl-il/scripts/continuous/online_il.py walker $agent $base_agent runs/demos/Walker2DBulletEnv-v0/ppo_3000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:1 --seed $seed 22 | done 23 | done 24 | done -------------------------------------------------------------------------------- /scripts/plot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from rlil.utils.plots import plot 3 | 4 | 5 | if __name__ == "__main__": 6 | parser = argparse.ArgumentParser( 7 | description="Plots the results of experiments.") 8 | parser.add_argument("dir", 9 | help="Experiment directory. This is a directory of exp_info, not runs/") 10 | parser.add_argument("--step", type=str, default="train_steps", 11 | help="The unit of x-axis. You can choose it from \ 12 | [sample_frames, sample_episodes, train_steps, minutes]") 13 | 14 | args = parser.parse_args() 15 | 16 | plot(args.dir, args.step) 17 | -------------------------------------------------------------------------------- /scripts/record_trajectory.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet 3 | import pybullet_envs 4 | import os 5 | import time 6 | import pickle 7 | import json 8 | import numpy as np 9 | import ray 10 | from rlil.memory import ExperienceReplayBuffer 11 | from rlil.initializer import set_replay_buffer, get_replay_buffer 12 | from rlil.samplers import AsyncSampler 13 | from rlil.environments import GymEnvironment 14 | from rlil.presets import continuous 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser(description="Record a trajectory of trained agent. \ 19 | The trajectory will be stored as transitions.pkl in the args.dir.") 20 | parser.add_argument( 21 | "dir", help="Directory where the agent's model is saved.") 22 | parser.add_argument( 23 | "--device", 24 | default="cpu", 25 | help="The name of the device to run the agent on (e.g. 
cpu, cuda, cuda:0)", 26 | ) 27 | parser.add_argument("--train", action="store_true", 28 | help="The model of lazy_agent: evaluation or training.") 29 | parser.add_argument("--num_workers", type=int, default=1, 30 | help="Number of workers for training") 31 | parser.add_argument("--frames", type=int, default=1e6, 32 | help="Number of frames to store") 33 | 34 | args = parser.parse_args() 35 | ray.init(include_webui=False, ignore_reinit_error=True) 36 | 37 | # load env 38 | if args.dir[-1] != "/": 39 | args.dir += "/" 40 | env_id = args.dir.split("/")[-3] 41 | env = GymEnvironment(env_id, append_time=True) 42 | 43 | # load agent 44 | agent_name = os.path.basename( 45 | os.path.dirname(args.dir)).split("_")[0] 46 | agent_fn = getattr(continuous, agent_name)() 47 | agent = agent_fn(env) 48 | agent.load(args.dir) 49 | lazy_agent = agent.make_lazy_agent( 50 | evaluation=not args.train, store_samples=True) 51 | 52 | # reset ExperienceReplayBuffer 53 | set_replay_buffer(ExperienceReplayBuffer(args.frames + 10, env)) 54 | 55 | # set sampler 56 | sampler = AsyncSampler(env, num_workers=args.num_workers) 57 | 58 | # start recording 59 | replay_buffer = get_replay_buffer() 60 | 61 | returns = [] 62 | while len(replay_buffer) < args.frames: 63 | sampler.start_sampling( 64 | lazy_agent, worker_episodes=1) 65 | 66 | sample_result = sampler.store_samples(timeout=1) 67 | for sample_info in sample_result.values(): 68 | returns += sample_info["returns"] 69 | 70 | # save return info of the policy 71 | returns_dict = {"mean": np.mean(returns), "std": np.std(returns)} 72 | filepath = os.path.join(args.dir, 'demo_return.json') 73 | with open(filepath, mode='w') as f: 74 | json.dump(returns_dict, f) 75 | 76 | # save replay buffer 77 | filepath = os.path.join(args.dir, 'transitions.pkl') 78 | with open(filepath, mode='wb') as f: 79 | samples = replay_buffer.get_all_transitions(return_cpprb=True) 80 | pickle.dump(samples, f) 81 | 82 | print("Transitions (size: {}) is saved at {}".format( 83 | len(replay_buffer), filepath)) 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="pytorch-rl-il", 5 | version="0.0.1", 6 | description=( 7 | "A library for building reinforcement learning and imitation learning agents in Pytorch"), 8 | packages=find_packages(), 9 | url="https://github.com/syuntoku14/pytorch-rl-il", 10 | author="Toshinori Kitamura", 11 | author_email="syuntoku14@gmail.com", 12 | install_requires=[ 13 | "gym[atari,box2d]", # atari environments 14 | "numpy", # math library 15 | "matplotlib", # plotting library 16 | "seaborn", # plotting library 17 | "pandas", 18 | "opencv-python", # used by atari wrappers 19 | "pybullet", # continuous environments 20 | "autopep8", # code quality tool 21 | "torch-testing", # testing library for pytorch 22 | "ray", # multiprocessing tool 23 | "pytest", # python testing library 24 | "cpprb", # fast replay buffer library 25 | "pytest-benchmark", 26 | "gitpython" 27 | # these should be installed globally: 28 | # "tensorflow", # needed for tensorboard 29 | # "torch", # deep learning library 30 | # "torchvision", # install alongside pytorch 31 | ], 32 | extras_require={ 33 | "pytorch": [ 34 | "torch", 35 | "torchvision", 36 | "tensorboard" 37 | ], 38 | "docs": [ 39 | "sphinx", 40 | "sphinx-autobuild", 41 | "sphinx-rtd-theme", 42 | 
"sphinx-automodapi" 43 | ] 44 | }, 45 | ) 46 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/__init__.py -------------------------------------------------------------------------------- /tests/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/agents/__init__.py -------------------------------------------------------------------------------- /tests/approximation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/approximation/__init__.py -------------------------------------------------------------------------------- /tests/approximation/bcq_encoder_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch_testing as tt 4 | from torch.nn.functional import mse_loss 5 | from rlil import nn 6 | from rlil.approximation.bcq_auto_encoder import BcqEncoder, BcqDecoder 7 | from rlil.environments import State, Action, GymEnvironment 8 | from rlil.presets.continuous.models import fc_bcq_encoder, fc_bcq_decoder 9 | import numpy as np 10 | 11 | 12 | # Test the network architecture of 13 | # https://github.com/sfujim/BCQ/blob/05c07fc442a2be96f6249b966682cf065045500f/BCQ.py 14 | @pytest.fixture 15 | def setUp(): 16 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 17 | Action.set_action_space(env.action_space) 18 | latent_dim = 32 19 | num_samples = 5 20 | encoder_model = fc_bcq_encoder(env, latent_dim=latent_dim) 21 | decoder_model = fc_bcq_decoder(env, latent_dim=latent_dim) 22 | 23 | encoder_optimizer = torch.optim.SGD(encoder_model.parameters(), lr=0.1) 24 | decoder_optimizer = torch.optim.SGD(decoder_model.parameters(), lr=0.1) 25 | encoder = BcqEncoder(model=encoder_model, 26 | latent_dim=latent_dim, 27 | optimizer=encoder_optimizer) 28 | decoder = BcqDecoder(model=decoder_model, 29 | latent_dim=latent_dim, 30 | space=env.action_space, 31 | optimizer=decoder_optimizer) 32 | sample_states = State.from_list([env.reset() for _ in range(num_samples)]) 33 | sample_actions = Action( 34 | torch.tensor([env.action_space.sample() for _ in range(num_samples)])) 35 | 36 | yield encoder, decoder, sample_states, sample_actions 37 | 38 | 39 | def test_decode(setUp): 40 | encoder, decoder, states, actions = setUp 41 | mean, log_var = encoder(states, actions) 42 | z = mean + (0.5 * log_var).exp() * torch.randn_like(log_var) 43 | dec = decoder(states, z) 44 | assert actions.shape == dec.shape 45 | 46 | 47 | def test_decode_multiple(setUp): 48 | encoder, decoder, states, actions = setUp 49 | dec = decoder.decode_multiple(states, 10) 50 | assert (actions.shape[0], 10, actions.shape[-1]) == dec[0].shape 51 | 52 | 53 | def test_reinforce(setUp): 54 | encoder, decoder, states, actions = setUp 55 | mean, log_var = encoder(states, actions) 56 | # reinforce mse 57 | z = mean + (0.5 * log_var).exp() * torch.randn_like(log_var) 58 | dec = decoder(states, z) 59 | loss = mse_loss(actions.features, dec) 60 | 61 | for _ in range(100): 62 | mean, log_var = encoder(states, actions) 63 | z = 
mean + log_var.exp() * torch.randn_like(log_var) 64 | dec = decoder(states, z) 65 | new_loss = mse_loss(actions.features, dec) 66 | decoder.reinforce(new_loss) 67 | encoder.reinforce() 68 | assert new_loss < loss 69 | 70 | # reinforce mse 71 | z = mean + (0.5 * log_var).exp() * torch.randn_like(log_var) 72 | dec = decoder(states, z) 73 | loss = nn.kl_loss_vae(mean, log_var) 74 | 75 | for _ in range(10): 76 | mean, log_var = encoder(states, actions) 77 | z = mean + log_var.exp() * torch.randn_like(log_var) 78 | dec = decoder(states, z) 79 | new_loss = nn.kl_loss_vae(mean, log_var) 80 | decoder.reinforce(new_loss) 81 | encoder.reinforce() 82 | assert new_loss < loss 83 | -------------------------------------------------------------------------------- /tests/approximation/dynamics_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch_testing as tt 4 | from torch.nn.functional import mse_loss 5 | from rlil import nn 6 | from rlil.approximation import Dynamics 7 | from rlil.environments import State, Action, GymEnvironment 8 | from rlil.presets.continuous.models import fc_dynamics 9 | 10 | 11 | @pytest.fixture 12 | def setUp(): 13 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 14 | dynamics_model = fc_dynamics(env) 15 | 16 | dynamics_optimizer = torch.optim.Adam(dynamics_model.parameters()) 17 | dynamics = Dynamics(model=dynamics_model, 18 | optimizer=dynamics_optimizer) 19 | 20 | yield env, dynamics 21 | 22 | 23 | def test_forward(setUp): 24 | env, dynamics = setUp 25 | state = env.reset() 26 | for _ in range(10): 27 | action = Action( 28 | torch.tensor(env.action_space.sample()).unsqueeze(0)) 29 | output = dynamics(state, action) 30 | assert state.shape == output.shape 31 | tt.assert_equal(state.mask, output.mask) 32 | 33 | 34 | def test_reinforce(setUp): 35 | env, dynamics = setUp 36 | state = env.reset() 37 | action = Action( 38 | torch.tensor(env.action_space.sample()).unsqueeze(0)) 39 | output = dynamics(state, action) 40 | loss = mse_loss(state.features, output.features) 41 | for _ in range(10): 42 | output = dynamics(state, action) 43 | new_loss = mse_loss(state.features, output.features) 44 | dynamics.reinforce(new_loss) 45 | assert new_loss < loss 46 | -------------------------------------------------------------------------------- /tests/approximation/ensemble_q_continuous_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch_testing as tt 4 | from torch.nn.functional import mse_loss 5 | from rlil import nn 6 | from rlil.approximation.ensemble_q_continuous import EnsembleQContinuous 7 | from rlil.environments import State, Action, GymEnvironment 8 | from rlil.presets.continuous.models import fc_q 9 | import numpy as np 10 | 11 | 12 | @pytest.fixture 13 | def setUp(): 14 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 15 | num_qs = 3 16 | num_samples = 5 17 | q_models = nn.ModuleList([fc_q(env) for _ in range(num_qs)]) 18 | qs_optimizer = torch.optim.Adam(q_models.parameters()) 19 | qs = EnsembleQContinuous(q_models, qs_optimizer) 20 | Action.set_action_space(env.action_space) 21 | sample_states = State.from_list([env.reset() for _ in range(num_samples)]) 22 | sample_actions = Action( 23 | torch.tensor([env.action_space.sample() for _ in range(num_samples)])) 24 | 25 | yield qs, sample_states, sample_actions 26 | 27 | 28 | def test_forward(setUp): 29 | qs, states, 
actions = setUp 30 | q_values = qs(states, actions) 31 | assert q_values.shape == (5, 3) 32 | with pytest.raises(AssertionError): 33 | tt.assert_almost_equal(q_values[0][0], q_values[0][1]) 34 | tt.assert_almost_equal(q_values[0][0], q_values[0][2]) 35 | 36 | 37 | def test_q1(setUp): 38 | qs, states, actions = setUp 39 | q_values = qs.q1(states, actions) 40 | assert q_values.shape == (5, ) 41 | 42 | 43 | def test_reinforce(setUp): 44 | qs, states, actions = setUp 45 | q_values = qs(states, actions) 46 | qs_params = [param.data.clone() for param in qs.model.parameters()] 47 | qs.reinforce(q_values.sum()) 48 | new_qs_params = [param.data for param in qs.model.parameters()] 49 | 50 | for param, new_param in zip(qs_params, new_qs_params): 51 | with pytest.raises(AssertionError): 52 | tt.assert_almost_equal(param, new_param) 53 | -------------------------------------------------------------------------------- /tests/approximation/feature_network_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from torch import nn 4 | import torch_testing as tt 5 | from torch.optim import Adam 6 | from rlil.environments import State, GymEnvironment 7 | from rlil.presets.continuous.models import fc_actor_critic 8 | from rlil.approximation import FeatureNetwork, VNetwork 9 | from rlil.policies.gaussian import GaussianPolicy 10 | 11 | 12 | STATE_DIM = 2 13 | 14 | 15 | @pytest.fixture 16 | def setUp(): 17 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 18 | 19 | feature_model, value_model, policy_model = fc_actor_critic(env) 20 | value_optimizer = Adam(value_model.parameters()) 21 | policy_optimizer = Adam(policy_model.parameters()) 22 | feature_optimizer = Adam(feature_model.parameters()) 23 | 24 | feature_nw = FeatureNetwork(feature_model, feature_optimizer) 25 | v = VNetwork(value_model, value_optimizer) 26 | policy = GaussianPolicy(policy_model, policy_optimizer, env.action_space) 27 | 28 | states = env.reset() 29 | yield states, feature_nw, v, policy 30 | 31 | 32 | def test_share_output(setUp): 33 | states, feature_nw, v, policy = setUp 34 | 35 | states = feature_nw(states) 36 | value = v(states) 37 | action = policy(states).sample() 38 | 39 | value_loss = value.sum() 40 | policy_loss = policy(states).log_prob(action+1).sum() 41 | 42 | policy.reinforce(policy_loss) 43 | v.reinforce(value_loss) 44 | feature_nw.reinforce() 45 | 46 | 47 | def test_independent_output(setUp): 48 | states, feature_nw, v, policy = setUp 49 | 50 | v_states = feature_nw(states) 51 | p_states = feature_nw(states) 52 | value = v(v_states) 53 | action = policy(p_states).sample() 54 | 55 | value_loss = value.sum() 56 | policy_loss = policy(p_states).log_prob(action+1).sum() 57 | 58 | policy.reinforce(policy_loss) 59 | v.reinforce(value_loss) 60 | feature_nw.reinforce() 61 | -------------------------------------------------------------------------------- /tests/approximation/q_network_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import gym 4 | from torch import nn 5 | from torch.nn.functional import smooth_l1_loss 6 | import torch_testing as tt 7 | import numpy as np 8 | from rlil.environments import State, Action 9 | from rlil.approximation import QNetwork, FixedTarget 10 | 11 | STATE_DIM = 2 12 | ACTIONS = 3 13 | action_space = gym.spaces.Discrete(10) 14 | 15 | 16 | class TestQNetwork(unittest.TestCase): 17 | def setUp(self): 18 | torch.manual_seed(2) 19 
| self.model = nn.Sequential( 20 | nn.Linear(STATE_DIM, ACTIONS) 21 | ) 22 | 23 | def optimizer(params): 24 | return torch.optim.SGD(params, lr=0.1) 25 | self.q = QNetwork(self.model, optimizer) 26 | 27 | def test_eval_list(self): 28 | states = State( 29 | torch.randn(5, STATE_DIM), 30 | mask=torch.tensor([1, 1, 0, 1, 0]) 31 | ) 32 | result = self.q.eval(states) 33 | tt.assert_almost_equal( 34 | result, 35 | torch.tensor([ 36 | [-0.238509, -0.726287, -0.034026], 37 | [-0.35688755, -0.6612102, 0.34849477], 38 | [0., 0., 0.], 39 | [0.1944, -0.5536, -0.2345], 40 | [0., 0., 0.] 41 | ]), 42 | decimal=2 43 | ) 44 | 45 | def test_eval_actions(self): 46 | states = State(torch.randn(3, STATE_DIM)) 47 | Action.set_action_space(action_space) 48 | actions = Action(torch.tensor([1, 2, 0]).unsqueeze(1)) 49 | result = self.q.eval(states, actions) 50 | self.assertEqual(result.shape, torch.Size([3])) 51 | tt.assert_almost_equal(result, torch.tensor( 52 | [-0.7262873, 0.3484948, -0.0296164])) 53 | 54 | def test_target_net(self): 55 | torch.manual_seed(2) 56 | model = nn.Sequential( 57 | nn.Linear(1, 1) 58 | ) 59 | optimizer = torch.optim.SGD(model.parameters(), lr=0.1) 60 | q = QNetwork( 61 | model, 62 | optimizer, 63 | target=FixedTarget(3) 64 | ) 65 | inputs = State(torch.tensor([1.]).unsqueeze(0)) 66 | 67 | def loss(policy_value): 68 | target = policy_value - 1 69 | return smooth_l1_loss(policy_value, target.detach()) 70 | 71 | policy_value = q(inputs) 72 | target_value = q.target(inputs).item() 73 | np.testing.assert_equal(policy_value.item(), -0.008584141731262207) 74 | np.testing.assert_equal(target_value, -0.008584141731262207) 75 | 76 | q.reinforce(loss(policy_value)) 77 | policy_value = q(inputs) 78 | target_value = q.target(inputs).item() 79 | np.testing.assert_equal(policy_value.item(), -0.20858412981033325) 80 | np.testing.assert_equal(target_value, -0.008584141731262207) 81 | 82 | q.reinforce(loss(policy_value)) 83 | policy_value = q(inputs) 84 | target_value = q.target(inputs).item() 85 | np.testing.assert_equal(policy_value.item(), -0.4085841178894043) 86 | np.testing.assert_equal(target_value, -0.008584141731262207) 87 | 88 | q.reinforce(loss(policy_value)) 89 | policy_value = q(inputs) 90 | target_value = q.target(inputs).item() 91 | np.testing.assert_equal(policy_value.item(), -0.6085841655731201) 92 | np.testing.assert_equal(target_value, -0.6085841655731201) 93 | 94 | q.reinforce(loss(policy_value)) 95 | policy_value = q(inputs) 96 | target_value = q.target(inputs).item() 97 | np.testing.assert_equal(policy_value.item(), -0.8085841536521912) 98 | np.testing.assert_equal(target_value, -0.6085841655731201) 99 | 100 | 101 | if __name__ == '__main__': 102 | unittest.main() 103 | -------------------------------------------------------------------------------- /tests/approximation/v_network_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | from torch import nn 4 | import torch_testing as tt 5 | from rlil.approximation.v_network import VNetwork 6 | from rlil.environments import State 7 | 8 | STATE_DIM = 2 9 | 10 | 11 | def loss(value, error): 12 | target = value + error 13 | return ((target.detach() - value) ** 2).mean() 14 | 15 | 16 | class TestVNetwork(unittest.TestCase): 17 | def setUp(self): 18 | torch.manual_seed(2) 19 | self.model = nn.Sequential( 20 | nn.Linear(STATE_DIM, 1) 21 | ) 22 | 23 | optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1) 24 | self.v = VNetwork(self.model, optimizer) 25 | 26 | 
def test_reinforce_list(self): 27 | states = State( 28 | torch.randn(5, STATE_DIM), 29 | mask=torch.tensor([1, 1, 0, 1, 0]) 30 | ) 31 | result = self.v(states) 32 | tt.assert_almost_equal(result, torch.tensor( 33 | [0.7053187, 0.3975691, 0., 0.2701665, 0.])) 34 | 35 | self.v.reinforce(loss(result, torch.tensor([1, -1, 1, 1, 1])).float()) 36 | result = self.v(states) 37 | tt.assert_almost_equal(result, torch.tensor( 38 | [0.9732854, 0.5453826, 0., 0.4344811, 0.])) 39 | 40 | def test_multi_reinforce(self): 41 | states = State( 42 | torch.randn(5, STATE_DIM), 43 | mask=torch.tensor([1, 1, 0, 1, 0, 0]) 44 | ) 45 | result1 = self.v(states[0:2]) 46 | self.v.reinforce(loss(result1, torch.tensor([1, 2])).float()) 47 | result2 = self.v(states[2:4]) 48 | self.v.reinforce(loss(result2, torch.tensor([1, 1])).float()) 49 | result3 = self.v(states[4:6]) 50 | self.v.reinforce(loss(result3, torch.tensor([1, 2])).float()) 51 | with self.assertRaises(Exception): 52 | self.v.reinforce(loss(result3, torch.tensor([1, 2])).float()) 53 | 54 | 55 | if __name__ == '__main__': 56 | unittest.main() 57 | -------------------------------------------------------------------------------- /tests/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/benchmark/__init__.py -------------------------------------------------------------------------------- /tests/benchmark/action_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | import torch 4 | import numpy as np 5 | from rlil.environments import Action, State 6 | import rlil.initializer as init 7 | 8 | 9 | @pytest.fixture() 10 | def set_action_space(): 11 | action_space = gym.spaces.Box( 12 | low=np.array([-1, -10]), high=np.array([1, 10])) 13 | Action.set_action_space(action_space) 14 | 15 | raw = torch.tensor([[0, 0], [2, 2], [-20, -20]], dtype=torch.float32) 16 | yield raw 17 | 18 | 19 | def test_create_action_debug(set_action_space, 20 | benchmark): 21 | init.enable_debug_mode() 22 | assert init.is_debug_mode() 23 | 24 | raw = set_action_space 25 | action = benchmark.pedantic(Action, 26 | kwargs={'raw': raw}, 27 | rounds=100, 28 | iterations=5) 29 | 30 | 31 | def test_create_action(set_action_space, 32 | benchmark): 33 | init.disable_debug_mode() 34 | assert not init.is_debug_mode() 35 | 36 | raw = set_action_space 37 | action = benchmark.pedantic(Action, 38 | kwargs={'raw': raw}, 39 | rounds=100, 40 | iterations=5) 41 | 42 | 43 | def get_features(action): 44 | return action.features 45 | 46 | 47 | def test_features_action_cpu(set_action_space, 48 | benchmark): 49 | raw = set_action_space 50 | action = Action(raw) 51 | 52 | benchmark.pedantic(get_features, 53 | rounds=100, 54 | kwargs={"action": action}, 55 | iterations=5) 56 | 57 | 58 | def test_features_action_cuda(set_action_space, 59 | benchmark): 60 | if not torch.cuda.is_available(): 61 | pytest.skip("CUDA is not available") 62 | 63 | raw = set_action_space 64 | action = Action(raw.to("cuda")) 65 | 66 | benchmark.pedantic(get_features, 67 | rounds=100, 68 | kwargs={"action": action}, 69 | iterations=5) 70 | -------------------------------------------------------------------------------- /tests/benchmark/cpu_gpu_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import numpy as np 4 | from rlil.environments 
import GymEnvironment, State 5 | from rlil.presets.continuous import ddpg 6 | 7 | 8 | def collect_samples(agent, env): 9 | while len(agent.replay_buffer) < 100: 10 | env.reset() 11 | while not env.done: 12 | env.step(agent.act(env.state, env.reward)) 13 | 14 | 15 | def test_ddpg_cuda(benchmark, use_gpu): 16 | if not torch.cuda.is_available(): 17 | pytest.skip("CUDA is not available") 18 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 19 | agent_fn = ddpg(replay_start_size=100) 20 | agent = agent_fn(env) 21 | collect_samples(agent, env) 22 | assert agent.should_train() 23 | benchmark.pedantic(agent.train, rounds=100) 24 | 25 | 26 | def test_ddpg_cpu(benchmark, use_cpu): 27 | if not torch.cuda.is_available(): 28 | pytest.skip("CUDA is not available") 29 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 30 | agent_fn = ddpg(replay_start_size=100) 31 | agent = agent_fn(env) 32 | collect_samples(agent, env) 33 | assert agent.should_train() 34 | benchmark.pedantic(agent.train, rounds=100) -------------------------------------------------------------------------------- /tests/benchmark/state_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | import torch 4 | import numpy as np 5 | from rlil.environments import Action, State 6 | import rlil.initializer as init 7 | 8 | 9 | def test_create_state_debug(benchmark): 10 | init.enable_debug_mode() 11 | assert init.is_debug_mode() 12 | 13 | raw = torch.randn(3, 4) 14 | benchmark.pedantic(State, 15 | kwargs={'raw': raw}, 16 | rounds=100, 17 | iterations=5) 18 | 19 | 20 | def test_create_state(benchmark): 21 | init.disable_debug_mode() 22 | assert not init.is_debug_mode() 23 | 24 | raw = torch.randn(3, 4) 25 | benchmark.pedantic(State, 26 | kwargs={'raw': raw}, 27 | rounds=100, 28 | iterations=5) 29 | -------------------------------------------------------------------------------- /tests/benchmark/train_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import numpy as np 4 | from rlil.environments import GymEnvironment, State 5 | from rlil.presets.continuous import ddpg, sac, td3, bc 6 | from ..presets.offline_continuous_test import get_transitions 7 | 8 | 9 | def collect_samples(agent, env): 10 | while len(agent.replay_buffer) < 100: 11 | env.reset() 12 | while not env.done: 13 | env.step(agent.act(env.state, env.reward)) 14 | 15 | 16 | def test_ddpg(benchmark): 17 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 18 | agent_fn = ddpg(replay_start_size=100) 19 | agent = agent_fn(env) 20 | collect_samples(agent, env) 21 | assert agent.should_train() 22 | benchmark.pedantic(agent.train, rounds=100) 23 | 24 | 25 | def test_sac(benchmark): 26 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 27 | agent_fn = sac(replay_start_size=100) 28 | agent = agent_fn(env) 29 | collect_samples(agent, env) 30 | assert agent.should_train() 31 | benchmark.pedantic(agent.train, rounds=100) 32 | 33 | 34 | def test_td3(benchmark): 35 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 36 | agent_fn = td3(replay_start_size=100) 37 | agent = agent_fn(env) 38 | collect_samples(agent, env) 39 | assert agent.should_train() 40 | benchmark.pedantic(agent.train, rounds=100) 41 | 42 | 43 | def test_bc(benchmark): 44 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 45 | transitions = get_transitions(env) 46 | agent_fn = 
bc(transitions) 47 | agent = agent_fn(env) 48 | assert len(transitions["obs"]) > 100 49 | benchmark.pedantic(agent.train, rounds=100) 50 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | from rlil.initializer import set_seed, enable_debug_mode, set_device, get_device 5 | from rlil import nn 6 | from rlil.environments import Action 7 | from rlil.policies.deterministic import DeterministicPolicyNetwork 8 | from rlil.memory import ExperienceReplayBuffer 9 | 10 | 11 | @pytest.fixture(scope="function", autouse=True) 12 | def seed(): 13 | """set random seed for testing""" 14 | set_seed(0) 15 | 16 | 17 | @pytest.fixture(scope="function", autouse=True) 18 | def debug(): 19 | enable_debug_mode() 20 | 21 | 22 | @pytest.fixture(scope="function", autouse=True) 23 | def reset_action_space(): 24 | Action._action_space = None 25 | 26 | 27 | @pytest.fixture 28 | def use_cpu(): 29 | pre_device = get_device() 30 | set_device("cpu") 31 | yield 32 | set_device(pre_device) 33 | 34 | 35 | @pytest.fixture 36 | def use_gpu(): 37 | pre_device = get_device() 38 | set_device("cuda") 39 | yield 40 | set_device(pre_device) 41 | -------------------------------------------------------------------------------- /tests/environments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/environments/__init__.py -------------------------------------------------------------------------------- /tests/environments/action_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | import torch_testing as tt 5 | import gym 6 | from rlil.environments.action import Action, action_decorator 7 | 8 | 9 | @pytest.fixture() 10 | def set_continuous_action_space(): 11 | action_space = gym.spaces.Box( 12 | low=np.array([-1, -10]), high=np.array([1, 10])) 13 | Action.set_action_space(action_space) 14 | 15 | 16 | @pytest.fixture() 17 | def set_discrete_action_space(): 18 | action_space = gym.spaces.Discrete(4) 19 | Action.set_action_space(action_space) 20 | 21 | 22 | def test_set_action_space_raises(): 23 | """ 24 | Action class should raise when the action_space is not set 25 | """ 26 | with pytest.raises(AssertionError): 27 | Action(torch.Tensor([[2, 3]])) 28 | 29 | 30 | def test_continuous_action(set_continuous_action_space): 31 | # GIVEN a set action_space 32 | 33 | # WHEN a new Action object with valid input is made 34 | # THEN the raw is equal to Action.raw 35 | raw = torch.tensor([[0, 0], [2, 2], [-20, -20]], dtype=torch.float32) 36 | action = Action(raw) 37 | tt.assert_equal(action.raw, raw) 38 | 39 | # WHEN a new Action object with a raw outside the action_space 40 | # THEN the action.features should clipped in the range 41 | tt.assert_equal(action.features, torch.tensor( 42 | [[0, 0], [1, 2], [-1, -10]], dtype=torch.float32)) 43 | 44 | # WHEN a new Action object with invalid input is made 45 | # THEN raise a assertion error 46 | with pytest.raises(AssertionError): 47 | raw = torch.randn(3, 5) 48 | action = Action(raw) 49 | 50 | 51 | def test_discrete_action(set_discrete_action_space): 52 | # GIVEN a set action_space 53 | 54 | # WHEN a new Action object with valid input is made 55 | # THEN the raw is equal 
to Action.raw 56 | raw = torch.tensor([1, 2, 3, 0]).unsqueeze(1) 57 | action = Action(raw) 58 | tt.assert_equal(action.raw, raw) 59 | 60 | # WHEN a new Action object with invalid input is made 61 | # THEN raise a assertion error 62 | with pytest.raises(AssertionError): 63 | raw = torch.tensor([5]) 64 | action = Action(raw) 65 | 66 | 67 | def test_from_list(set_continuous_action_space): 68 | action1 = Action(torch.randn(1, 2)) 69 | action2 = Action(torch.randn(1, 2)) 70 | action3 = Action(torch.randn(1, 2)) 71 | action = Action.from_list([action1, action2, action3]) 72 | tt.assert_equal(action.raw, torch.cat( 73 | (action1.raw, action2.raw, action3.raw))) 74 | 75 | 76 | def test_from_numpy(set_continuous_action_space): 77 | actions = np.array([[1, 2]]) 78 | action = Action.from_numpy(actions) 79 | tt.assert_equal(action.raw, torch.tensor([[1, 2]])) 80 | 81 | 82 | def test_raw_numpy(set_continuous_action_space): 83 | actions = np.array([[1, 2]]) 84 | action = Action.from_numpy(actions) 85 | np.testing.assert_equal(actions, action.raw_numpy()) 86 | 87 | 88 | def test_get_item(): 89 | action_space = gym.spaces.Box(low=np.array( 90 | [-1, -2, -3, -4]), high=np.array([1, 2, 3, 4])) 91 | Action.set_action_space(action_space) 92 | raw = torch.randn(3, 4) 93 | actions = Action(raw) 94 | action = actions[2] 95 | tt.assert_equal(action.raw, raw[2].unsqueeze(0)) 96 | 97 | 98 | def test_len(): 99 | action_space = gym.spaces.Box(low=np.array( 100 | [-1, -2, -3, -4]), high=np.array([1, 2, 3, 4])) 101 | Action.set_action_space(action_space) 102 | action = Action(torch.randn(3, 4)) 103 | assert len(action) == 3 104 | 105 | 106 | def test_action_decorator(): 107 | action_space = gym.spaces.Box(low=-1, high=1, shape=(2, )) 108 | Action.set_action_space(action_space) 109 | @action_decorator 110 | def act(): 111 | return torch.tensor([3, 4]).unsqueeze(0) 112 | 113 | action = act() 114 | tt.assert_equal(action.raw, torch.tensor([3, 4]).unsqueeze(0)) 115 | -------------------------------------------------------------------------------- /tests/environments/gym_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from rlil.environments.gym import GymEnvironment 4 | from rlil.environments import State, Action 5 | import torch 6 | import gym 7 | 8 | 9 | def test_env_discrete(): 10 | env = gym.make('CartPole-v0') 11 | env = GymEnvironment(env) 12 | env.reset() 13 | while not env._state.done: 14 | action = Action.action_space().sample() 15 | action = Action(torch.tensor([action]).unsqueeze(0)) 16 | state, reward = env.step(action) 17 | 18 | 19 | def test_env_continuous(): 20 | env = gym.make('LunarLanderContinuous-v2') 21 | env = GymEnvironment(env) 22 | env.reset() 23 | while not env._state.done: 24 | action = Action.action_space().sample() 25 | action = Action(torch.tensor([action])) 26 | state, reward = env.step(action) 27 | 28 | 29 | def test_append_time(): 30 | env = gym.make('LunarLanderContinuous-v2') 31 | env = GymEnvironment(env, append_time=True) 32 | state = env.reset() 33 | last_timestep = state.raw[0, -1].item() 34 | while not env._state.done: 35 | action = Action.action_space().sample() 36 | action = Action(torch.tensor([action])) 37 | state, reward = env.step(action) 38 | assert state.raw[0, -1].item() > last_timestep 39 | last_timestep = state.raw[0, -1].item() 40 | assert state.shape[1] == env._env.observation_space.shape[0] + 1 41 | -------------------------------------------------------------------------------- 
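The rollout pattern exercised by gym_test.py above (and by the watch/record scripts earlier in this section) is the same everywhere: construct a GymEnvironment, reset it, wrap raw actions in the Action container, and step until env.done. Below is a minimal illustrative sketch of that loop assembled from the calls shown in these tests; it is an editor's sketch rather than repository code, and the episode_return bookkeeping and float(reward) conversion are assumptions made for illustration.

import torch
from rlil.environments import GymEnvironment, Action

# Wrap a gym id; append_time=True appends a timestep feature to observations,
# as checked by test_append_time above.
env = GymEnvironment('LunarLanderContinuous-v2', append_time=True)

state = env.reset()
episode_return = 0.0  # assumed bookkeeping for this example only
while not env.done:
    # Sample a raw action and wrap it with a batch dimension of 1,
    # mirroring test_env_continuous above.
    raw = torch.tensor([Action.action_space().sample()])
    state, reward = env.step(Action(raw))
    episode_return += float(reward)  # assumes reward is a one-element tensor
print('episode return: {}'.format(episode_return))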
/tests/environments/state_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | import torch_testing as tt 5 | from rlil.environments.state import State 6 | 7 | DONE = torch.tensor( 8 | [0], 9 | dtype=torch.bool, 10 | ) 11 | 12 | NOT_DONE = torch.tensor( 13 | [1], 14 | dtype=torch.bool, 15 | ) 16 | 17 | 18 | def test_constructor_defaults(): 19 | raw = torch.randn(3, 4) 20 | state = State(raw) 21 | # state.features returns raw 22 | tt.assert_equal(state.features, raw) 23 | # state.mask returns 1 default 24 | tt.assert_equal(state.mask, torch.ones(3, dtype=torch.bool)) 25 | # state.raw == raw 26 | tt.assert_equal(state.raw, raw) 27 | assert state.info == [None] * 3 28 | 29 | 30 | def test_custom_constructor_args(): 31 | raw = torch.randn(3, 4) 32 | mask = torch.zeros(3).bool() 33 | info = ['a', 'b', 'c'] 34 | state = State(raw, mask=mask, info=info) 35 | tt.assert_equal(state.features, raw) 36 | # check zeros masks 37 | tt.assert_equal(state.mask, torch.zeros(3, dtype=torch.bool)) 38 | # check info constructor 39 | assert state.info == info 40 | 41 | 42 | def test_not_done(): 43 | state = State(torch.randn(1, 4)) 44 | assert not state.done 45 | 46 | 47 | def test_done(): 48 | raw = torch.randn(1, 4) 49 | state = State(raw, mask=DONE) 50 | assert state.done 51 | 52 | 53 | def test_from_list(): 54 | state1 = State(torch.randn(1, 4), mask=DONE, info=['a']) 55 | state2 = State(torch.randn(1, 4), mask=NOT_DONE, info=['b']) 56 | state3 = State(torch.randn(1, 4)) 57 | state = State.from_list([state1, state2, state3]) 58 | tt.assert_equal(state.raw, torch.cat( 59 | (state1.raw, state2.raw, state3.raw))) 60 | tt.assert_equal(state.mask, torch.tensor([0, 1, 1])) 61 | assert state.info == ['a', 'b', None] 62 | 63 | 64 | def test_from_numpy(): 65 | gym_obs = np.array([1, 2, 3]) 66 | done = True 67 | info = ['a'] 68 | with pytest.raises(AssertionError): 69 | state = State.from_numpy(gym_obs, done, info) 70 | gym_obs = np.random.randn(3, 5) 71 | done = np.zeros(3, dtype=np.bool) 72 | info = ['a'] 73 | state = State.from_numpy(gym_obs, done, info) 74 | 75 | tt.assert_equal(state.raw, torch.tensor(gym_obs, dtype=torch.float32), ) 76 | tt.assert_equal(state.done, torch.tensor(done)) 77 | assert state.info == ['a'] 78 | 79 | 80 | def test_raw_numpy(): 81 | np_raws = np.random.randn(3, 4) 82 | np_masks = np.ones(3) 83 | state = State(torch.tensor(np_raws), mask=torch.tensor(np_masks)) 84 | out_np_raws, out_np_dones = state.raw_numpy() 85 | np.testing.assert_equal(np_raws, out_np_raws) 86 | np.testing.assert_equal(np_masks, ~out_np_dones) 87 | 88 | 89 | def test_get_item(): 90 | raw = torch.randn(3, 4) 91 | states = State(raw) 92 | state = states[2] 93 | tt.assert_equal(state.raw, raw[2].unsqueeze(0)) 94 | tt.assert_equal(state.mask, NOT_DONE) 95 | assert state.info == [None] 96 | 97 | 98 | def test_len(): 99 | state = State(torch.randn(3, 4)) 100 | assert len(state) == 3 101 | -------------------------------------------------------------------------------- /tests/experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/experiments/__init__.py -------------------------------------------------------------------------------- /tests/experiments/experiment_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 
import numpy as np 3 | import torch 4 | import ray 5 | from rlil.presets.continuous import sac 6 | from rlil.environments import GymEnvironment 7 | from rlil.experiments import Experiment, Trainer 8 | from rlil.utils.writer import Writer 9 | from rlil.initializer import set_writer 10 | from rlil.samplers import AsyncSampler 11 | 12 | 13 | class MockWriter(Writer): 14 | def __init__(self, label): 15 | self.data = {} 16 | self.label = label 17 | self.sample_frames = 0 18 | self.sample_episodes = 1 19 | self.train_steps = 0 20 | 21 | def add_scalar(self, key, value, step="sample_frames"): 22 | key = key + "/" + step 23 | if key not in self.data: 24 | self.data[key] = {"values": [], "steps": []} 25 | self.data[key]["values"].append(value) 26 | self.data[key]["steps"].append(self._get_step_value(step)) 27 | 28 | def add_text(self, name, text, step="sample_frames"): 29 | pass 30 | 31 | def _get_step_value(self, _type): 32 | if _type == "sample_frames": 33 | return self.sample_frames 34 | if _type == "sample_episodes": 35 | return self.sample_episodes 36 | if _type == "train_steps": 37 | return self.train_steps 38 | return _type 39 | 40 | 41 | class MockExperiment(Experiment): 42 | def __init__( 43 | self, 44 | agent_fn, 45 | env, 46 | exp_info='default_experiments', 47 | num_workers=1, 48 | max_sample_frames=np.inf, 49 | max_sample_episodes=np.inf, 50 | ): 51 | 52 | # set writer 53 | agent_name = agent_fn.__name__ 54 | writer = self._make_writer(agent_name, env.name, exp_info) 55 | set_writer(writer) 56 | 57 | # start training 58 | agent = agent_fn(env) 59 | 60 | sampler = AsyncSampler(env, num_workers=num_workers) 61 | eval_sampler = AsyncSampler(env) 62 | 63 | trainer = Trainer( 64 | agent=agent, 65 | sampler=sampler, 66 | eval_sampler=eval_sampler, 67 | max_sample_frames=max_sample_frames, 68 | max_sample_episodes=max_sample_episodes 69 | ) 70 | 71 | trainer.start_training() 72 | 73 | def _make_writer(self, agent_name, env_name, 74 | exp_info="default_experiments"): 75 | self._writer = MockWriter(agent_name + '_' + env_name) 76 | return self._writer 77 | 78 | 79 | def test_adds_label(): 80 | ray.init(include_webui=False, ignore_reinit_error=True) 81 | env = GymEnvironment('Pendulum-v0', append_time=True) 82 | experiment = MockExperiment(sac(), env, max_sample_episodes=1) 83 | assert experiment._writer.label == "_sac_Pendulum-v0" 84 | 85 | 86 | @pytest.mark.skip() 87 | def test_writes_returns_eps(): 88 | ray.init(include_webui=False, ignore_reinit_error=True) 89 | env = GymEnvironment('Pendulum-v0', append_time=True) 90 | experiment = MockExperiment(sac(), env, max_sample_episodes=3) 91 | np.testing.assert_equal( 92 | experiment._writer.data["returns/episode"]["steps"], 93 | np.array([1, 2, 3]), 94 | ) 95 | 96 | 97 | if __name__ == "__main__": 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /tests/experiments/trainer_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | from torch.optim import Adam 5 | import gym 6 | import time 7 | import ray 8 | from rlil.environments import GymEnvironment 9 | from rlil import nn 10 | from rlil.experiments import Trainer 11 | from rlil.samplers import AsyncSampler 12 | from rlil.memory import ExperienceReplayBuffer 13 | from rlil.initializer import set_replay_buffer 14 | from rlil.presets.continuous import sac 15 | from ..mock_agent import MockAgent 16 | 17 | 18 | @pytest.fixture 19 | def setUp(): 20 | 
ray.init(include_webui=False, ignore_reinit_error=True) 21 | 22 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 23 | 24 | replay_buffer_size = 100000 25 | replay_buffer = ExperienceReplayBuffer(replay_buffer_size, env) 26 | set_replay_buffer(replay_buffer) 27 | 28 | agent = MockAgent(env) 29 | num_workers = 3 30 | sampler = AsyncSampler( 31 | env, 32 | num_workers=num_workers, 33 | ) 34 | 35 | yield env, agent, sampler 36 | 37 | 38 | def test_trainer_frames(setUp): 39 | max_sample_frames = 100 40 | env, agent, sampler = setUp 41 | trainer = Trainer(agent, sampler, max_sample_frames=max_sample_frames) 42 | trainer.start_training() 43 | assert trainer._writer.sample_frames > max_sample_frames 44 | 45 | 46 | def test_trainer_episodes(setUp): 47 | max_sample_episodes = 5 48 | env, agent, sampler = setUp 49 | trainer = Trainer(agent, sampler, max_sample_episodes=max_sample_episodes) 50 | trainer.start_training() 51 | assert trainer._writer.sample_frames > max_sample_episodes 52 | 53 | 54 | def test_training(setUp): 55 | env, agent, sampler = setUp 56 | agent_fn = sac(replay_start_size=50) 57 | agent = agent_fn(env) 58 | 59 | trainer = Trainer(agent, sampler, max_sample_episodes=5) 60 | trainer.start_training() 61 | -------------------------------------------------------------------------------- /tests/memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/memory/__init__.py -------------------------------------------------------------------------------- /tests/memory/gae_wrapper_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import random 3 | import torch 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | import torch_testing as tt 8 | from rlil.approximation import VNetwork, FeatureNetwork 9 | from rlil.environments import State, Action, GymEnvironment 10 | from rlil.memory import ExperienceReplayBuffer, GaeWrapper 11 | from rlil.presets.continuous.models import fc_actor_critic 12 | from rlil.utils import Samples 13 | 14 | 15 | class DummyFeatures: 16 | def target(self, states): 17 | return states 18 | 19 | 20 | class DummyV: 21 | def target(self, feature): 22 | return torch.ones(len(feature)) 23 | 24 | 25 | @pytest.fixture 26 | def setUp(use_cpu): 27 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 28 | buffer = ExperienceReplayBuffer(1000, env) 29 | gae_buffer = GaeWrapper(buffer, discount_factor=1, lam=0.3) 30 | 31 | # base buffer 32 | states = [env.observation_space.sample() for i in range(4)] 33 | actions = [env.action_space.sample() for i in range(3)] 34 | states = State(torch.tensor(states)) 35 | states, next_states = states[:-1], states[1:] 36 | actions = Action(torch.tensor(actions)) 37 | rewards = torch.arange(0, 3, dtype=torch.float) 38 | samples = Samples(states, actions, rewards, next_states) 39 | gae_buffer.store(samples) 40 | 41 | feature_nw = DummyFeatures() 42 | v = DummyV() 43 | yield gae_buffer, feature_nw, v 44 | 45 | 46 | def test_advantage(setUp): 47 | gae_buffer, feature_nw, v = setUp 48 | 49 | states, _, rewards, next_states, _, _ = gae_buffer.get_all_transitions() 50 | values = v.target(feature_nw.target(states)) 51 | next_values = v.target(feature_nw.target(next_states)) 52 | advantages = gae_buffer.compute_gae(rewards, values, 53 | next_values, next_states.mask) 54 | 55 | # rewards: [0, 1, 
2] 56 | # td_errors: [0, 1, 2] 57 | expected = torch.tensor([0 + 1 * 0.3 + 2 * 0.3 * 0.3, 58 | 1 + 2 * 0.3, 59 | 2]) 60 | tt.assert_almost_equal( 61 | advantages, 62 | (expected - expected.mean()) / expected.std(), decimal=3) 63 | -------------------------------------------------------------------------------- /tests/memory/gail_wrapper_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import random 3 | import torch 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | import torch_testing as tt 8 | from rlil.environments import State, Action, GymEnvironment 9 | from rlil.memory import ExperienceReplayBuffer, GailWrapper 10 | from rlil.presets.continuous.models import fc_discriminator 11 | from rlil.approximation import Discriminator 12 | from rlil.initializer import set_device 13 | from rlil.utils import Samples 14 | 15 | 16 | @pytest.fixture 17 | def setUp(use_cpu): 18 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 19 | replay_buffer = ExperienceReplayBuffer(1000, env) 20 | 21 | # base buffer 22 | states = State(torch.tensor([env.observation_space.sample()]*100)) 23 | actions = Action(torch.tensor([env.action_space.sample()]*99)) 24 | rewards = torch.arange(0, 99, dtype=torch.float) 25 | samples = Samples(states[:-1], actions, rewards, states[1:]) 26 | replay_buffer.store(samples) 27 | 28 | # expert buffer 29 | exp_replay_buffer = ExperienceReplayBuffer(1000, env) 30 | exp_states = State(torch.tensor([env.observation_space.sample()]*100)) 31 | exp_actions = Action(torch.tensor([env.action_space.sample()]*99)) 32 | exp_rewards = torch.arange(100, 199, dtype=torch.float) 33 | exp_samples = Samples( 34 | exp_states[:-1], exp_actions, exp_rewards, exp_states[1:]) 35 | exp_replay_buffer.store(exp_samples) 36 | # discriminator 37 | discriminator_model = fc_discriminator(env) 38 | discriminator_optimizer = Adam(discriminator_model.parameters()) 39 | discriminator = Discriminator(discriminator_model, 40 | discriminator_optimizer) 41 | 42 | gail_buffer = GailWrapper(replay_buffer, exp_replay_buffer, discriminator) 43 | 44 | samples = { 45 | "buffer": {"states": states, 46 | "actions": actions, 47 | "rewards": rewards}, 48 | "expert": {"states": states, 49 | "actions": actions, 50 | "rewards": rewards}, 51 | } 52 | yield gail_buffer, samples 53 | 54 | 55 | def test_sample(setUp): 56 | gail_buffer, samples = setUp 57 | res_states, res_actions, res_rewards, res_next_states, _, _ = \ 58 | gail_buffer.sample(4) 59 | 60 | # test states 61 | tt.assert_equal(res_states.features[0], 62 | samples["buffer"]["states"].features[0]) 63 | 64 | # test actions 65 | tt.assert_equal(res_actions.features[0], 66 | samples["buffer"]["actions"].features[0]) 67 | 68 | # test next_states 69 | tt.assert_equal( 70 | res_next_states.features[0], samples["buffer"]["states"].features[0]) 71 | 72 | 73 | def test_sample_both(setUp): 74 | gail_buffer, samples = setUp 75 | samples, expert_samples = gail_buffer.sample_both(4) 76 | 77 | 78 | def test_store(setUp): 79 | gail_buffer, samples = setUp 80 | assert len(gail_buffer) == 99 81 | 82 | gail_samples = Samples(samples["buffer"]["states"][:-1], 83 | samples["buffer"]["actions"], 84 | samples["buffer"]["rewards"], 85 | samples["buffer"]["states"][1:]) 86 | gail_buffer.store(gail_samples) 87 | 88 | assert len(gail_buffer) == 198 89 | 90 | 91 | def test_clear(setUp): 92 | gail_buffer, samples = setUp 93 | gail_buffer.clear() 94 | assert len(gail_buffer) == 0 95 | assert 
len(gail_buffer.expert_buffer) != 0 96 | -------------------------------------------------------------------------------- /tests/memory/sqil_wrapper_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import random 3 | import torch 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | import torch_testing as tt 8 | from rlil.environments import State, Action, GymEnvironment 9 | from rlil.memory import ExperienceReplayBuffer, SqilWrapper 10 | from rlil.initializer import set_device 11 | from rlil.utils import Samples 12 | 13 | 14 | @pytest.fixture 15 | def setUp(use_cpu): 16 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 17 | replay_buffer = ExperienceReplayBuffer(1000, env) 18 | 19 | # base buffer 20 | states = State(torch.tensor([env.observation_space.sample()]*10)) 21 | actions = Action(torch.tensor([env.action_space.sample()]*9)) 22 | rewards = torch.arange(0, 9, dtype=torch.float) 23 | samples = Samples(states[:-1], actions, rewards, states[1:]) 24 | replay_buffer.store(samples) 25 | 26 | # expert buffer 27 | exp_replay_buffer = ExperienceReplayBuffer(1000, env) 28 | exp_states = State(torch.tensor([env.observation_space.sample()]*10)) 29 | exp_actions = Action(torch.tensor([env.action_space.sample()]*9)) 30 | exp_rewards = torch.arange(10, 19, dtype=torch.float) 31 | exp_samples = Samples( 32 | exp_states[:-1], exp_actions, exp_rewards, exp_states[1:]) 33 | exp_replay_buffer.store(exp_samples) 34 | sqil_buffer = SqilWrapper(replay_buffer, exp_replay_buffer) 35 | 36 | samples = { 37 | "buffer": {"states": states, 38 | "actions": actions, 39 | "rewards": rewards}, 40 | "expert": {"states": states, 41 | "actions": actions, 42 | "rewards": rewards}, 43 | } 44 | yield sqil_buffer, samples 45 | 46 | 47 | def test_sample(setUp): 48 | sqil_buffer, samples = setUp 49 | res_states, res_actions, res_rewards, res_next_states, _, _ = \ 50 | sqil_buffer.sample(40) 51 | 52 | # test rewards 53 | # half of the rewards are 1 and the others are 0 54 | assert res_rewards.sum() == len(res_rewards) / 2 55 | -------------------------------------------------------------------------------- /tests/mock_agent.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import torch 4 | from rlil import nn 5 | from rlil.environments import Action 6 | from rlil.policies.deterministic import DeterministicPolicyNetwork 7 | from rlil.memory import ExperienceReplayBuffer 8 | from rlil.initializer import get_replay_buffer, get_n_step 9 | from rlil.utils import Samples 10 | 11 | 12 | class MockAgent: 13 | def __init__(self, env): 14 | model = nn.Sequential( 15 | nn.Flatten(), 16 | nn.Linear(env.state_space.shape[0], 17 | Action.action_space().shape[0]) 18 | ) 19 | self.policy_model = DeterministicPolicyNetwork( 20 | model, Action.action_space()) 21 | 22 | self._state = None 23 | self._action = None 24 | self.replay_buffer = get_replay_buffer() 25 | 26 | def act(self, state, reward): 27 | samples = Samples(self._state, self._action, reward, state) 28 | self.replay_buffer.store(samples) 29 | self._state = state 30 | 31 | with torch.no_grad(): 32 | action = self.policy_model( 33 | state.to(self.policy_model.device)) 34 | 35 | self._action = Action(action).to("cpu") 36 | return self._action 37 | 38 | def make_lazy_agent(self): 39 | return MockLazyAgent(self.policy_model) 40 | 41 | def train(self): 42 | pass 43 | 44 | 45 | class MockLazyAgent: 46 | def 
__init__(self, policy_model): 47 | self._state = None 48 | self._action = None 49 | self.policy_model = policy_model 50 | self.replay_buffer = None 51 | # for N step replay buffer 52 | self._n_step, self._discount_factor = get_n_step() 53 | 54 | def set_replay_buffer(self, env): 55 | self.replay_buffer = ExperienceReplayBuffer( 56 | 1e7, env, n_step=self._n_step, 57 | discount_factor=self._discount_factor) 58 | 59 | def act(self, state, reward): 60 | samples = Samples(self._state, self._action, reward, state) 61 | self.replay_buffer.store(samples) 62 | self._state = state 63 | 64 | with torch.no_grad(): 65 | action = self.policy_model( 66 | state.to(self.policy_model.device)) 67 | 68 | self._action = Action(action).to("cpu") 69 | return self._action 70 | 71 | def compute_priorities(self, samples): 72 | return None 73 | -------------------------------------------------------------------------------- /tests/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/nn/__init__.py -------------------------------------------------------------------------------- /tests/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/policies/__init__.py -------------------------------------------------------------------------------- /tests/policies/bcq_deterministic_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import unittest 3 | import torch 4 | import torch_testing as tt 5 | import numpy as np 6 | from gym.spaces import Box 7 | from rlil import nn 8 | from rlil.approximation import FixedTarget 9 | from rlil.environments import State, Action, squash_action 10 | from rlil.policies import BCQDeterministicPolicy 11 | 12 | STATE_DIM = 2 13 | ACTION_DIM = 3 14 | 15 | 16 | class TestBCQDeterministic(unittest.TestCase): 17 | def setUp(self): 18 | torch.manual_seed(2) 19 | self.model = nn.Sequential( 20 | nn.Linear0(STATE_DIM + ACTION_DIM, ACTION_DIM) 21 | ) 22 | self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01) 23 | self.space = Box(np.array([-1, -1, -1]), 24 | np.array([1, 1, 1]), dtype=np.float32) 25 | self.policy = BCQDeterministicPolicy( 26 | self.model, 27 | self.optimizer, 28 | self.space 29 | ) 30 | Action.set_action_space(self.space) 31 | 32 | def test_output_shape(self): 33 | state = State(torch.randn(1, STATE_DIM)) 34 | vae_action = Action(torch.randn(1, ACTION_DIM)) 35 | action = self.policy(state, vae_action) 36 | self.assertEqual(action.shape, (1, ACTION_DIM)) 37 | state = State(torch.randn(5, STATE_DIM)) 38 | vae_action = Action(torch.randn(5, ACTION_DIM)) 39 | action = self.policy(state, vae_action) 40 | self.assertEqual(action.shape, (5, ACTION_DIM)) 41 | 42 | def test_step_one(self): 43 | state = State(torch.randn(1, STATE_DIM)) 44 | vae_action = Action(torch.randn(1, ACTION_DIM)) 45 | self.policy(state, vae_action) 46 | self.policy.step() 47 | 48 | @pytest.mark.skip 49 | def test_converge(self): 50 | state = State(torch.randn(1, STATE_DIM)) 51 | vae_action = Action(torch.randn(1, ACTION_DIM)) 52 | target = vae_action.features + torch.tensor([[0.25, 0.5, -0.5]]) 53 | 54 | for _ in range(0, 200): 55 | action = self.policy(state, vae_action) 56 | loss = ((target - action) ** 2).mean() 57 | loss.backward() 58 | 
self.policy.step() 59 | 60 | self.assertLess(loss, 0.001) 61 | 62 | @pytest.mark.skip 63 | def test_target(self): 64 | self.policy = BCQDeterministicPolicy( 65 | self.model, 66 | self.optimizer, 67 | self.space, 68 | target=FixedTarget(3) 69 | ) 70 | 71 | # choose initial action 72 | state = State(torch.ones(1, STATE_DIM)) 73 | vae_action = Action(torch.ones(1, ACTION_DIM)) 74 | action = self.policy(state, vae_action) 75 | tt.assert_equal(action, squash_action( 76 | vae_action.features, action_space=self.space)) 77 | 78 | # run update step, make sure target network doesn't change 79 | action.sum().backward(retain_graph=True) 80 | self.policy.step() 81 | tt.assert_equal(self.policy.target(state, vae_action), 82 | squash_action(vae_action.features, action_space=self.space)) 83 | 84 | # again... 85 | action.sum().backward(retain_graph=True) 86 | self.policy.step() 87 | tt.assert_equal(self.policy.target(state, vae_action), 88 | squash_action(vae_action.features, action_space=self.space)) 89 | 90 | # third time, target should be updated 91 | action.sum().backward(retain_graph=True) 92 | self.policy.step() 93 | # tt.assert_allclose( 94 | # self.policy.eval(state, vae_action), 95 | # torch.tensor([[-0.595883, -0.595883, -0.595883]]), 96 | # atol=1e-4, 97 | # ) 98 | 99 | 100 | if __name__ == '__main__': 101 | unittest.main() 102 | -------------------------------------------------------------------------------- /tests/policies/deterministic_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import unittest 3 | import torch 4 | import torch_testing as tt 5 | import numpy as np 6 | from gym.spaces import Box 7 | from rlil import nn 8 | from rlil.approximation import FixedTarget 9 | from rlil.environments import State 10 | from rlil.policies import DeterministicPolicy 11 | 12 | STATE_DIM = 2 13 | ACTION_DIM = 3 14 | 15 | 16 | class TestDeterministic(unittest.TestCase): 17 | def setUp(self): 18 | torch.manual_seed(2) 19 | self.model = nn.Sequential( 20 | nn.Linear0(STATE_DIM, ACTION_DIM) 21 | ) 22 | self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01) 23 | self.space = Box(np.array([-1, -1, -1]), 24 | np.array([1, 1, 1]), dtype=np.float32) 25 | self.policy = DeterministicPolicy( 26 | self.model, 27 | self.optimizer, 28 | self.space 29 | ) 30 | 31 | def test_output_shape(self): 32 | state = State(torch.randn(1, STATE_DIM)) 33 | action = self.policy(state) 34 | self.assertEqual(action.shape, (1, ACTION_DIM)) 35 | state = State(torch.randn(5, STATE_DIM)) 36 | action = self.policy(state) 37 | self.assertEqual(action.shape, (5, ACTION_DIM)) 38 | 39 | def test_step_one(self): 40 | state = State(torch.randn(1, STATE_DIM)) 41 | self.policy(state) 42 | self.policy.step() 43 | 44 | def test_converge(self): 45 | state = State(torch.randn(1, STATE_DIM)) 46 | target = torch.tensor([0.25, 0.5, -0.5]) 47 | 48 | for _ in range(0, 200): 49 | action = self.policy(state) 50 | loss = ((target - action) ** 2).mean() 51 | self.policy.reinforce(loss) 52 | 53 | self.assertLess(loss, 0.001) 54 | 55 | @pytest.mark.skip 56 | def test_target(self): 57 | self.policy = DeterministicPolicy( 58 | self.model, 59 | self.optimizer, 60 | self.space, 61 | target=FixedTarget(3) 62 | ) 63 | 64 | # choose initial action 65 | state = State(torch.ones(1, STATE_DIM)) 66 | action = self.policy(state) 67 | tt.assert_equal(action, torch.zeros(1, ACTION_DIM)) 68 | 69 | # run update step, make sure target network doesn't change 70 | 
action.sum().backward(retain_graph=True) 71 | self.policy.step() 72 | tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM)) 73 | 74 | # again... 75 | action.sum().backward(retain_graph=True) 76 | self.policy.step() 77 | tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM)) 78 | 79 | # third time, target should be updated 80 | action.sum().backward(retain_graph=True) 81 | self.policy.step() 82 | 83 | 84 | if __name__ == '__main__': 85 | unittest.main() 86 | -------------------------------------------------------------------------------- /tests/policies/gaussian_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | import torch_testing as tt 6 | from gym.spaces import Box 7 | from rlil.environments import State 8 | from rlil.policies import GaussianPolicy 9 | 10 | STATE_DIM = 2 11 | ACTION_DIM = 3 12 | 13 | 14 | class TestGaussian(unittest.TestCase): 15 | def setUp(self): 16 | 17 | torch.manual_seed(2) 18 | self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1])) 19 | self.model = nn.Sequential( 20 | nn.Linear(STATE_DIM, ACTION_DIM * 2) 21 | ) 22 | optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01) 23 | self.policy = GaussianPolicy(self.model, optimizer, self.space) 24 | 25 | def test_output_shape(self): 26 | state = State(torch.randn(1, STATE_DIM)) 27 | action = self.policy(state).sample() 28 | self.assertEqual(action.shape, (1, ACTION_DIM)) 29 | state = State(torch.randn(5, STATE_DIM)) 30 | action = self.policy(state).sample() 31 | self.assertEqual(action.shape, (5, ACTION_DIM)) 32 | 33 | def test_reinforce_one(self): 34 | state = State(torch.randn(1, STATE_DIM)) 35 | dist = self.policy(state) 36 | action = dist.sample() 37 | log_prob1 = dist.log_prob(action) 38 | loss = -log_prob1.mean() 39 | self.policy.reinforce(loss) 40 | 41 | dist = self.policy(state) 42 | log_prob2 = dist.log_prob(action) 43 | 44 | self.assertGreater(log_prob2.item(), log_prob1.item()) 45 | 46 | def test_converge(self): 47 | state = State(torch.randn(1, STATE_DIM)) 48 | target = torch.tensor([1., 2., -1.]) 49 | 50 | for _ in range(0, 1000): 51 | dist = self.policy(state) 52 | action = dist.sample() 53 | log_prob = dist.log_prob(action) 54 | error = ((target - action) ** 2).mean() 55 | loss = (error * log_prob).mean() 56 | self.policy.reinforce(loss) 57 | 58 | self.assertTrue(error < 1) 59 | 60 | 61 | if __name__ == '__main__': 62 | unittest.main() 63 | -------------------------------------------------------------------------------- /tests/policies/soft_deterministic_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | import torch_testing as tt 6 | from gym.spaces import Box 7 | from rlil.environments import State 8 | from rlil.policies import SoftDeterministicPolicy 9 | 10 | STATE_DIM = 2 11 | ACTION_DIM = 3 12 | 13 | 14 | @pytest.fixture 15 | def setUp(): 16 | torch.manual_seed(2) 17 | space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1])) 18 | model = nn.Sequential( 19 | nn.Linear(STATE_DIM, ACTION_DIM * 2) 20 | ) 21 | optimizer = torch.optim.RMSprop(model.parameters(), lr=0.01) 22 | policy = SoftDeterministicPolicy(model, optimizer, space) 23 | yield policy 24 | 25 | 26 | def test_output_shape(setUp): 27 | policy = setUp 28 | state = State(torch.randn(1, STATE_DIM)) 29 | action, _ = policy(state) 30 | assert 
action.shape == (1, ACTION_DIM) 31 | 32 | state = State(torch.randn(5, STATE_DIM)) 33 | action, _ = policy(state) 34 | assert action.shape == (5, ACTION_DIM) 35 | 36 | 37 | def test_reinforce_one(setUp): 38 | policy = setUp 39 | state = State(torch.randn(1, STATE_DIM)) 40 | action, log_prob1 = policy(state) 41 | loss = -log_prob1.mean() 42 | policy.reinforce(loss) 43 | 44 | action, log_prob2 = policy(state) 45 | 46 | assert log_prob2.item() > log_prob1.item() 47 | 48 | 49 | def test_sample_multiple(setUp): 50 | policy = setUp 51 | state = State(torch.randn(5, STATE_DIM)) 52 | actions, raw_actions = policy.sample_multiple(state, num_sample=10) 53 | assert actions.shape == (5, 10, ACTION_DIM) 54 | assert raw_actions.shape == (5, 10, ACTION_DIM) 55 | -------------------------------------------------------------------------------- /tests/policies/softmax_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | from torch import nn 4 | import torch_testing as tt 5 | from rlil.environments import State 6 | from rlil.policies import SoftmaxPolicy 7 | 8 | STATE_DIM = 2 9 | ACTIONS = 3 10 | 11 | 12 | class TestSoftmax(unittest.TestCase): 13 | def setUp(self): 14 | torch.manual_seed(2) 15 | self.model = nn.Sequential( 16 | nn.Linear(STATE_DIM, ACTIONS) 17 | ) 18 | optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1) 19 | self.policy = SoftmaxPolicy(self.model, optimizer) 20 | 21 | def test_run(self): 22 | state1 = State(torch.randn(1, STATE_DIM)) 23 | dist1 = self.policy(state1) 24 | action1 = dist1.sample() 25 | log_prob1 = dist1.log_prob(action1) 26 | self.assertEqual(action1.item(), 0) 27 | 28 | state2 = State(torch.randn(1, STATE_DIM)) 29 | dist2 = self.policy(state2) 30 | action2 = dist2.sample() 31 | log_prob2 = dist2.log_prob(action2) 32 | self.assertEqual(action2.item(), 2) 33 | 34 | loss = -(torch.tensor([-1, 1000000]) * 35 | torch.cat((log_prob1, log_prob2))).mean() 36 | self.policy.reinforce(loss) 37 | 38 | state3 = State(torch.randn(1, STATE_DIM)) 39 | dist3 = self.policy(state3) 40 | action3 = dist3.sample() 41 | self.assertEqual(action3.item(), 2) 42 | 43 | def test_multi_action(self): 44 | states = State(torch.randn(3, STATE_DIM)) 45 | actions = self.policy(states).sample() 46 | tt.assert_equal(actions, torch.tensor([2, 2, 0])) 47 | 48 | def test_list(self): 49 | torch.manual_seed(1) 50 | states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1])) 51 | dist = self.policy(states) 52 | actions = dist.sample() 53 | log_probs = dist.log_prob(actions) 54 | tt.assert_equal(actions, torch.tensor([1, 2, 1])) 55 | loss = -(torch.tensor([[1, 2, 3]]) * log_probs).mean() 56 | self.policy.reinforce(loss) 57 | 58 | def test_reinforce(self): 59 | def loss(log_probs): 60 | return -log_probs.mean() 61 | 62 | states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1])) 63 | actions = self.policy.eval(states).sample() 64 | 65 | # notice the values increase with each successive reinforce 66 | log_probs = self.policy(states).log_prob(actions) 67 | tt.assert_almost_equal(log_probs, torch.tensor( 68 | [-0.84, -0.62, -0.757]), decimal=3) 69 | self.policy.reinforce(loss(log_probs)) 70 | log_probs = self.policy(states).log_prob(actions) 71 | tt.assert_almost_equal(log_probs, torch.tensor( 72 | [-0.811, -0.561, -0.701]), decimal=3) 73 | self.policy.reinforce(loss(log_probs)) 74 | log_probs = self.policy(states).log_prob(actions) 75 | tt.assert_almost_equal(log_probs, torch.tensor( 76 | [-0.785, -0.51, -0.651]), 
decimal=3) 77 | 78 | 79 | if __name__ == '__main__': 80 | unittest.main() 81 | -------------------------------------------------------------------------------- /tests/presets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/presets/__init__.py -------------------------------------------------------------------------------- /tests/presets/offline_continuous_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | from rlil.environments import GymEnvironment 4 | from rlil.presets.continuous import bcq, bc, vae_bc, bear, brac 5 | from rlil.presets import env_validation, trainer_validation 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.environments import Action 8 | from rlil.initializer import set_replay_buffer 9 | from copy import deepcopy 10 | from ..mock_agent import MockAgent 11 | 12 | 13 | def get_transitions(env): 14 | replay_buffer = ExperienceReplayBuffer(1000, env) 15 | set_replay_buffer(replay_buffer) 16 | agent = MockAgent(env) 17 | 18 | while len(agent.replay_buffer) < 200: 19 | env.reset() 20 | while not env.done: 21 | env.step(agent.act(env.state, env.reward)) 22 | 23 | return agent.replay_buffer.get_all_transitions(return_cpprb=True) 24 | 25 | 26 | def test_bcq(): 27 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 28 | transitions = get_transitions(env) 29 | assert len(transitions["obs"]) > 100 30 | 31 | env_validation(bcq(transitions), env, done_step=50) 32 | trainer_validation(bcq(transitions), env) 33 | 34 | 35 | def test_bear(): 36 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 37 | transitions = get_transitions(env) 38 | assert len(transitions["obs"]) > 100 39 | 40 | env_validation(bear(transitions), env, done_step=50) 41 | trainer_validation(bear(transitions), env) 42 | 43 | 44 | def test_brac(): 45 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 46 | transitions = get_transitions(env) 47 | assert len(transitions["obs"]) > 100 48 | 49 | env_validation(brac(transitions, bc_iters=5), env, done_step=50) 50 | trainer_validation(brac(transitions, bc_iters=5), env) 51 | 52 | 53 | def test_bc(): 54 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 55 | transitions = get_transitions(env) 56 | assert len(transitions["obs"]) > 100 57 | 58 | env_validation(bc(transitions), env, done_step=50) 59 | trainer_validation(bc(transitions), env) 60 | 61 | 62 | def test_vae_bc(): 63 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 64 | transitions = get_transitions(env) 65 | assert len(transitions["obs"]) > 100 66 | 67 | env_validation(vae_bc(transitions), env, done_step=50) 68 | trainer_validation(vae_bc(transitions), env) 69 | -------------------------------------------------------------------------------- /tests/presets/online_continuous_test.py: -------------------------------------------------------------------------------- 1 | import ptvsd 2 | import pytest 3 | from rlil.environments import GymEnvironment 4 | from rlil.presets.continuous import vac, ddpg, sac, td3, noisy_td3, ppo, rs_mpc 5 | from rlil.presets import env_validation, trainer_validation 6 | from rlil.initializer import set_device 7 | 8 | 9 | def test_vac(): 10 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 11 | env_validation(vac(replay_start_size=50), env, 
done_step=50) 12 | trainer_validation(vac(replay_start_size=50), env) 13 | 14 | 15 | def test_ddpg(): 16 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 17 | env_validation(ddpg(replay_start_size=50), env, done_step=50) 18 | trainer_validation(ddpg(replay_start_size=50), env) 19 | 20 | 21 | def test_sac(use_cpu): 22 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 23 | env_validation(sac(replay_start_size=50), env, done_step=50) 24 | trainer_validation(sac(replay_start_size=50), env) 25 | 26 | 27 | def test_n_step(): 28 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 29 | for preset in [ddpg, td3, sac]: 30 | agent_fn = preset(n_step=5) 31 | agent = agent_fn(env) 32 | lazy_agent = agent.make_lazy_agent() 33 | lazy_agent.set_replay_buffer(env) 34 | assert lazy_agent._n_step == 5 35 | 36 | 37 | def test_prioritized(): 38 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 39 | for preset in [ddpg, td3, sac]: 40 | env_validation(preset(prioritized=True, replay_start_size=50), 41 | env, done_step=50) 42 | 43 | 44 | def test_td3(): 45 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 46 | env_validation(td3(replay_start_size=50), env, done_step=50) 47 | trainer_validation(td3(replay_start_size=50), env) 48 | 49 | 50 | def test_noisy_td3(): 51 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 52 | env_validation(noisy_td3(replay_start_size=50), env, done_step=50) 53 | trainer_validation(noisy_td3(replay_start_size=50), env) 54 | 55 | 56 | def test_ppo(): 57 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 58 | env_validation(ppo(replay_start_size=5), env, done_step=50) 59 | trainer_validation(ppo(replay_start_size=50), env) 60 | 61 | 62 | def test_rs_mpc(): 63 | env = GymEnvironment("Pendulum-v0", append_time=True) 64 | env_validation(rs_mpc(replay_start_size=5), env, done_step=50) 65 | trainer_validation(rs_mpc(replay_start_size=5), env) 66 | 67 | 68 | def test_apex(use_cpu): 69 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 70 | for preset in [ddpg, td3, sac]: 71 | trainer_validation( 72 | preset(replay_start_size=5, use_apex=True), env, apex=True) 73 | -------------------------------------------------------------------------------- /tests/presets/online_il_continuous_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | from rlil.environments import GymEnvironment 4 | from rlil.presets.continuous import airl, gail, sqil, td3, sac, ppo 5 | from rlil.presets import env_validation, trainer_validation 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.environments import Action 8 | from rlil.initializer import set_replay_buffer, get_writer 9 | from copy import deepcopy 10 | from ..mock_agent import MockAgent 11 | 12 | 13 | def get_transitions(env): 14 | replay_buffer = ExperienceReplayBuffer(1000, env) 15 | set_replay_buffer(replay_buffer) 16 | agent = MockAgent(env) 17 | 18 | while len(agent.replay_buffer) < 100: 19 | env.reset() 20 | while not env.done: 21 | env.step(agent.act(env.state, env.reward)) 22 | 23 | return agent.replay_buffer.get_all_transitions(return_cpprb=True) 24 | 25 | 26 | def test_gail(use_cpu): 27 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 28 | transitions = get_transitions(env) 29 | base_agent_fn = td3(replay_start_size=0) 30 | assert len(transitions["obs"]) > 100 31 | 32 | 
env_validation(gail(transitions=transitions, 33 | base_agent_fn=base_agent_fn, 34 | replay_start_size=10), env, done_step=50) 35 | trainer_validation(gail(transitions=transitions, 36 | base_agent_fn=base_agent_fn, 37 | replay_start_size=10), env) 38 | 39 | writer = get_writer() 40 | assert writer.train_steps > 1 41 | 42 | 43 | def test_sqil(use_cpu): 44 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 45 | transitions = get_transitions(env) 46 | base_agent_fn = sac(replay_start_size=0) 47 | assert len(transitions["obs"]) > 100 48 | 49 | env_validation(sqil(transitions=transitions, 50 | base_agent_fn=base_agent_fn, 51 | replay_start_size=10), env, done_step=50) 52 | trainer_validation(sqil(transitions=transitions, 53 | base_agent_fn=base_agent_fn, 54 | replay_start_size=10), env) 55 | 56 | writer = get_writer() 57 | assert writer.train_steps > 1 58 | 59 | 60 | def test_airl(): 61 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 62 | transitions = get_transitions(env) 63 | base_agent_fn = ppo(replay_start_size=0) 64 | assert len(transitions["obs"]) > 100 65 | 66 | env_validation(airl(transitions=transitions, 67 | base_agent_fn=base_agent_fn, 68 | replay_start_size=10), env, done_step=50) 69 | trainer_validation(airl(transitions=transitions, 70 | base_agent_fn=base_agent_fn, 71 | replay_start_size=10), env) 72 | 73 | writer = get_writer() 74 | assert writer.train_steps > 1 75 | -------------------------------------------------------------------------------- /tests/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/samplers/__init__.py -------------------------------------------------------------------------------- /tests/samplers/asyncsampler_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | import gym 5 | import time 6 | import warnings 7 | import ray 8 | from rlil import nn 9 | from rlil.environments import GymEnvironment, Action 10 | from rlil.policies.deterministic import DeterministicPolicyNetwork 11 | from rlil.samplers import AsyncSampler, StartInfo 12 | from rlil.memory import ExperienceReplayBuffer 13 | from rlil.initializer import set_replay_buffer 14 | from ..mock_agent import MockAgent 15 | 16 | 17 | @pytest.fixture() 18 | def setUp(): 19 | ray.init(include_webui=False, ignore_reinit_error=True) 20 | 21 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 22 | 23 | replay_buffer_size = 100000 24 | replay_buffer = ExperienceReplayBuffer(replay_buffer_size, env) 25 | set_replay_buffer(replay_buffer) 26 | 27 | agent = MockAgent(env) 28 | 29 | yield {"env": env, "agent": agent} 30 | 31 | 32 | def test_sampler_episode(setUp): 33 | env = setUp["env"] 34 | agent = setUp["agent"] 35 | 36 | num_workers = 3 37 | worker_episodes = 6 38 | sampler = AsyncSampler( 39 | env, 40 | num_workers=num_workers, 41 | ) 42 | lazy_agent = agent.make_lazy_agent() 43 | sampler.start_sampling( 44 | lazy_agent, worker_episodes=worker_episodes) 45 | sample_result = sampler.store_samples(timeout=1e8) 46 | 47 | # GIVEN the store_samples function with infinite timeout 48 | # WHEN worker_episodes are specified 49 | # THEN sampler collects samples by the num of num_workers * worker_episodes 50 | assert len(sample_result[StartInfo()]["frames"] 51 | ) == num_workers * worker_episodes 52 | 53 | 54 | def 
test_sampler_frames(setUp): 55 | env = setUp["env"] 56 | agent = setUp["agent"] 57 | 58 | num_workers = 3 59 | worker_frames = 50 60 | sampler = AsyncSampler( 61 | env, 62 | num_workers=num_workers, 63 | ) 64 | 65 | lazy_agent = agent.make_lazy_agent() 66 | sampler.start_sampling( 67 | lazy_agent, worker_frames=worker_frames) 68 | sample_result = sampler.store_samples(timeout=1e8) 69 | 70 | # GIVEN the store_samples function with infinite timeout 71 | # WHEN worker_frames are specified 72 | # THEN sampler collects samples until frames exceeds worker_frames * num_workers 73 | assert sum(sample_result[StartInfo()]["frames"] 74 | ) > worker_frames * num_workers 75 | 76 | 77 | def test_ray_wait(setUp): 78 | env = setUp["env"] 79 | agent = setUp["agent"] 80 | sampler = AsyncSampler( 81 | env, 82 | num_workers=3, 83 | ) 84 | 85 | worker_episodes = 100 86 | lazy_agent = agent.make_lazy_agent() 87 | sampler.start_sampling( 88 | lazy_agent, worker_episodes=worker_episodes) 89 | sampler.store_samples(timeout=0.1) 90 | 91 | # GIVEN the store_samples function with short timeout 92 | # WHEN worker_episodes is large 93 | # THEN sampler doesn't wait the worker finishes sampling 94 | assert len(sampler.replay_buffer) == 0 95 | 96 | 97 | def test_eval_sampler(setUp): 98 | env = setUp["env"] 99 | agent = setUp["agent"] 100 | sampler = AsyncSampler( 101 | env, 102 | num_workers=3, 103 | ) 104 | 105 | worker_episodes = 3 106 | lazy_agent = agent.make_lazy_agent() 107 | start_info = StartInfo(sample_frames=100, 108 | sample_episodes=1000, 109 | train_steps=10000) 110 | sampler.start_sampling( 111 | lazy_agent, 112 | worker_episodes=worker_episodes, 113 | start_info=start_info 114 | ) 115 | 116 | result = sampler.store_samples(timeout=1e9, evaluation=True) 117 | # when evaluation=True, sampler doesn't store samples to the replay_buffer 118 | assert len(sampler.replay_buffer) == 0 119 | 120 | result["info_list"] 121 | -------------------------------------------------------------------------------- /tests/utils/writer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/utils/writer/__init__.py -------------------------------------------------------------------------------- /tests/utils/writer/writer_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from rlil.utils.writer import ExperimentWriter 4 | from rlil.initializer import set_writer, get_writer 5 | from shutil import rmtree 6 | import pathlib 7 | import os 8 | import pandas as pd 9 | from tensorboard.backend.event_processing import event_accumulator 10 | 11 | 12 | @pytest.fixture() 13 | def init_writer(): 14 | writer = ExperimentWriter(agent_name="test_agent", 15 | env_name="test_env", 16 | exp_info="test_exp", 17 | sample_frame_interval=10, 18 | sample_episode_interval=100, 19 | train_step_interval=1000) 20 | 21 | # GIVEN sample_frame_interval == 10 22 | # WHEN add_scalar with step="sample_frames" is called 23 | # THEN sample_frames is saved every 10 samples 24 | for i in range(100): 25 | writer.sample_frames = i 26 | writer.add_scalar("test", i, step="sample_frames") 27 | 28 | # same test for sample_episodes 29 | for i in range(1000): 30 | writer.sample_episodes = i 31 | writer.add_scalar("test", i, step="sample_episodes") 32 | 33 | # same test for train_steps 34 | for i in range(10000): 35 | writer.train_steps = i 36 | 
writer.add_scalar("test", i, step="train_steps") 37 | 38 | set_writer(writer) 39 | 40 | # load events file 41 | test_path = pathlib.Path("runs/test_exp") 42 | for p in test_path.rglob("events*"): 43 | eventspath = p 44 | event_acc = event_accumulator.EventAccumulator( 45 | str(eventspath), size_guidance={'scalars': 0}) 46 | 47 | # test make dir 48 | assert os.path.isdir(str(test_path)) 49 | 50 | yield event_acc 51 | 52 | # rm test_exp dir 53 | writer.close() 54 | rmtree(str(test_path), ignore_errors=True) 55 | 56 | 57 | def read_scalars(event_acc): 58 | scalars = {} 59 | steps = {} 60 | 61 | for tag in event_acc.Tags()['scalars']: 62 | events = event_acc.Scalars(tag) 63 | scalars[tag] = [event.value for event in events] 64 | steps[tag] = [event.step for event in events] 65 | return steps, scalars 66 | 67 | 68 | def test_get_step_value(init_writer): 69 | writer = get_writer() 70 | writer.sample_frames = 1 71 | writer.sample_episodes = 2 72 | writer.train_steps = 3 73 | 74 | assert writer._get_step_value("sample_frames") == 1 75 | assert writer._get_step_value("sample_episodes") == 2 76 | assert writer._get_step_value("train_steps") == 3 77 | 78 | 79 | def test_add_scalar_interval(init_writer): 80 | writer = get_writer() 81 | writer.close() 82 | 83 | event_acc = init_writer 84 | event_acc.Reload() 85 | 86 | steps, scalars = read_scalars(event_acc) 87 | assert scalars['test_env/test/sample_frames'] == [ 88 | i for i in range(10, 100, 10)] 89 | 90 | assert scalars['test_env/test/sample_episodes'] == [ 91 | i for i in range(100, 1000, 100)] 92 | 93 | assert scalars['test_env/test/train_steps'] == [ 94 | i for i in range(1000, 10000, 1000)] 95 | 96 | 97 | def test_step_value(init_writer): 98 | writer = get_writer() 99 | writer.sample_frames = 1e9 100 | writer.add_scalar("test", 999, step="sample_frames", step_value=12345) 101 | writer.close() 102 | 103 | event_acc = init_writer 104 | event_acc.Reload() 105 | steps, scalars = read_scalars(event_acc) 106 | assert 12345 in steps["test_env/test/sample_frames"] 107 | 108 | 109 | def test_save_csv(init_writer): 110 | writer = get_writer() 111 | writer.sample_frames = 1e9 112 | writer.add_scalar("test", 500, step="sample_frames", save_csv=True) 113 | 114 | test_path = pathlib.Path("runs/test_exp") 115 | for p in test_path.rglob("*.csv"): 116 | csv_file = p 117 | 118 | csv_data = pd.read_csv(str(csv_file), names=["sample_frames", "return"]) 119 | assert csv_data["sample_frames"].tolist() == [1e9] 120 | --------------------------------------------------------------------------------