├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── assets ├── TensorBoard.gif ├── continuous.png ├── different_gait.gif ├── offline.png ├── online_il.png └── rs-mpc.gif ├── docker ├── Dockerfile └── run_docker.bash ├── rlil ├── __init__.py ├── agents │ ├── __init__.py │ ├── airl.py │ ├── base.py │ ├── bc.py │ ├── bcq.py │ ├── bear.py │ ├── brac.py │ ├── ddpg.py │ ├── gail.py │ ├── noisy_td3.py │ ├── ppo.py │ ├── rs_mpc.py │ ├── sac.py │ ├── td3.py │ ├── vac.py │ └── vae_bc.py ├── approximation │ ├── __init__.py │ ├── approximation.py │ ├── bcq_auto_encoder.py │ ├── checkpointer │ │ └── __init__.py │ ├── discriminator.py │ ├── dynamics.py │ ├── ensemble_q_continuous.py │ ├── feature_network.py │ ├── q_continuous.py │ ├── q_network.py │ ├── target │ │ ├── __init__.py │ │ ├── abstract.py │ │ ├── fixed.py │ │ ├── polyak.py │ │ └── trivial.py │ └── v_network.py ├── environments │ ├── __init__.py │ ├── action.py │ ├── base.py │ ├── data │ │ └── ant_half_front_legs.xml │ ├── gym.py │ ├── reward_fns.py │ ├── rlil_envs.py │ └── state.py ├── experiments │ ├── __init__.py │ ├── experiment.py │ └── trainer.py ├── initializer.py ├── memory │ ├── __init__.py │ ├── airl_wrapper.py │ ├── base.py │ ├── gae_wrapper.py │ ├── gail_wrapper.py │ ├── replay_buffer.py │ └── sqil_wrapper.py ├── nn │ └── __init__.py ├── policies │ ├── __init__.py │ ├── bcq_deterministic.py │ ├── deterministic.py │ ├── gaussian.py │ ├── soft_deterministic.py │ └── softmax.py ├── presets │ ├── __init__.py │ ├── continuous │ │ ├── __init__.py │ │ ├── airl.py │ │ ├── bc.py │ │ ├── bcq.py │ │ ├── bear.py │ │ ├── brac.py │ │ ├── ddpg.py │ │ ├── gail.py │ │ ├── models.py │ │ ├── noisy_td3.py │ │ ├── ppo.py │ │ ├── rs_mpc.py │ │ ├── sac.py │ │ ├── sqil.py │ │ ├── td3.py │ │ ├── vac.py │ │ └── vae_bc.py │ └── validate_agent.py ├── samplers │ ├── __init__.py │ ├── asyncsampler.py │ └── base.py └── utils │ ├── __init__.py │ ├── plots.py │ └── writer.py ├── runs └── .gitignore ├── scripts ├── __init__.py ├── continuous │ ├── offline.py │ ├── online.py │ ├── online_il.py │ └── watch_continuous.py ├── offline_continuous.bash ├── online_continuous.bash ├── online_il_continuous.bash ├── plot.py └── record_trajectory.py ├── setup.py └── tests ├── __init__.py ├── agents └── __init__.py ├── approximation ├── __init__.py ├── bcq_encoder_test.py ├── dynamics_test.py ├── ensemble_q_continuous_test.py ├── feature_network_test.py ├── q_network_test.py └── v_network_test.py ├── benchmark ├── __init__.py ├── action_test.py ├── cpu_gpu_test.py ├── state_test.py └── train_test.py ├── conftest.py ├── environments ├── __init__.py ├── action_test.py ├── gym_test.py └── state_test.py ├── experiments ├── __init__.py ├── experiment_test.py └── trainer_test.py ├── memory ├── __init__.py ├── airl_wrapper_test.py ├── gae_wrapper_test.py ├── gail_wrapper_test.py ├── replay_buffer_test.py └── sqil_wrapper_test.py ├── mock_agent.py ├── nn ├── __init__.py └── nn_test.py ├── policies ├── __init__.py ├── bcq_deterministic_test.py ├── deterministic_test.py ├── gaussian_test.py ├── soft_deterministic_test.py └── softmax_test.py ├── presets ├── __init__.py ├── offline_continuous_test.py ├── online_continuous_test.py └── online_il_continuous_test.py ├── samplers ├── __init__.py └── asyncsampler_test.py └── utils └── writer ├── __init__.py └── writer_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # python 2 | *.pyc 3 | __pycache__ 4 | 5 | # build directories 6 | /build 7 | /dist 8 | heavy_runs 9 | scripts/runs/ 10 
| 11 | # editor 12 | .vscode 13 | .idea 14 | 15 | # non-committed code 16 | local 17 | legacy 18 | /out 19 | /others 20 | 21 | # Byte-compiled / optimized / DLL files 22 | __pycache__/ 23 | *.py[cod] 24 | *$py.class 25 | 26 | # C extensions 27 | *.so 28 | 29 | # Distribution / packaging 30 | .Python 31 | build/ 32 | develop-eggs/ 33 | dist/ 34 | downloads/ 35 | eggs/ 36 | .eggs/ 37 | lib/ 38 | lib64/ 39 | parts/ 40 | sdist/ 41 | var/ 42 | wheels/ 43 | pip-wheel-metadata/ 44 | share/python-wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | MANIFEST 49 | 50 | # PyInstaller 51 | # Usually these files are written by a python script from a template 52 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 53 | *.manifest 54 | *.spec 55 | 56 | # Installer logs 57 | pip-log.txt 58 | pip-delete-this-directory.txt 59 | 60 | # Unit test / coverage reports 61 | htmlcov/ 62 | .tox/ 63 | .nox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *.cover 70 | *.py,cover 71 | .hypothesis/ 72 | .pytest_cache/ 73 | cover/ 74 | 75 | # Translations 76 | *.mo 77 | *.pot 78 | 79 | # Django stuff: 80 | *.log 81 | local_settings.py 82 | db.sqlite3 83 | db.sqlite3-journal 84 | 85 | # Flask stuff: 86 | instance/ 87 | .webassets-cache 88 | 89 | # Scrapy stuff: 90 | .scrapy 91 | 92 | # Sphinx documentation 93 | docs/_build/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Toshinori Kitamura 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install: 2 | conda install torch torchvision 3 | pip install tensorboard 4 | pip install -e . 5 | 6 | test: 7 | pytest -v --benchmark-skip 8 | 9 | benchmark: 10 | pytest -v --benchmark-only 11 | 12 | autopep8: 13 | autopep8 --in-place --recursive . 
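# NOTE: the `deploy` target further below depends on a `lint` target that is not
# defined in this Makefile; a minimal sketch of one (assuming flake8 is
# installed) could be:
# lint:
#     flake8 rlil tests scripts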
14 | 15 | tensorboard: 16 | tensorboard --logdir runs 17 | 18 | clean: 19 | rm -rf dist 20 | rm -rf build 21 | 22 | build: clean 23 | python setup.py sdist bdist_wheel 24 | 25 | deploy: lint test build 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /assets/TensorBoard.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/TensorBoard.gif -------------------------------------------------------------------------------- /assets/continuous.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/continuous.png -------------------------------------------------------------------------------- /assets/different_gait.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/different_gait.gif -------------------------------------------------------------------------------- /assets/offline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/offline.png -------------------------------------------------------------------------------- /assets/online_il.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/online_il.png -------------------------------------------------------------------------------- /assets/rs-mpc.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/assets/rs-mpc.gif -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # This Dockerfile is based on https://github.com/ikeyasu/docker-reinforcement-learning 2 | 3 | # To use cuda9.2 container, you need to install nvidia-driver >= 396.26 4 | # See https://github.com/NVIDIA/nvidia-docker/wiki/CUDA#requirements 5 | FROM syuntoku/rl_ws:latest 6 | MAINTAINER syuntoku14 7 | 8 | RUN git clone git@github.com:syuntoku14/pytorch-rl-il.git 9 | RUN cd pytorch-rl-il && pip install -e . 
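# NOTE: the clone above uses the SSH URL (git@github.com:...), which fails during a
# plain `docker build` unless an SSH key is made available to the build;
# cloning the public HTTPS URL https://github.com/syuntoku14/pytorch-rl-il.git
# instead avoids that requirement.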
10 | 11 | CMD ["bash"] 12 | WORKDIR /root -------------------------------------------------------------------------------- /docker/run_docker.bash: -------------------------------------------------------------------------------- 1 | # umask 0002 is to change the permission to a normal user 2 | 3 | run_docker() { 4 | docker run --rm -it \ 5 | -p 6080:6080 \ 6 | -p 8888:8888 \ 7 | -p 6006:6006 \ 8 | -p 5678:5678 \ 9 | -p 8265:8265 \ 10 | -v ~/RL_ws:/root/RL_ws \ 11 | -v ~/pytorch-rl-il:/root/pytorch-rl-il \ 12 | -e DISPLAY=:0 \ 13 | --name rl \ 14 | --shm-size 256G \ 15 | --entrypoint "" \ 16 | syuntoku/rl_ws:rlil bash -c "umask 0002 && bash" 17 | } 18 | 19 | run_docker_gpu() { 20 | docker run --rm -it \ 21 | -p 6080:6080 \ 22 | -p 8888:8888 \ 23 | -p 6006:6006 \ 24 | -p 5678:5678 \ 25 | -p 8265:8265 \ 26 | -v ~/RL_ws:/root/RL_ws \ 27 | -v ~/pytorch-rl-il:/root/pytorch-rl-il \ 28 | -e DISPLAY=:0 \ 29 | --name rl \ 30 | --shm-size 256G \ 31 | -e NVIDIA_VISIBLE_DEVICES=all \ 32 | -e NVIDIA_DRIVER_CAPABILITIES=all \ 33 | --gpus=all \ 34 | --entrypoint "" \ 35 | syuntoku/rl_ws:rlil bash -c "umask 0002 && bash" 36 | } 37 | 38 | getopts "n" OPT 39 | case $OPT in 40 | n ) echo "--runtime=nvidia" 41 | run_docker_gpu ;; 42 | ? ) echo "Without gpu" 43 | run_docker ;; 44 | esac 45 | -------------------------------------------------------------------------------- /rlil/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/rlil/__init__.py -------------------------------------------------------------------------------- /rlil/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Agent, LazyAgent 2 | from .vac import VAC 3 | from .ddpg import DDPG 4 | from .sac import SAC 5 | from .td3 import TD3 6 | from .noisy_td3 import NoisyTD3 7 | from .bcq import BCQ 8 | from .bear import BEAR 9 | from .brac import BRAC 10 | from .bc import BC 11 | from .vae_bc import VaeBC 12 | from .ppo import PPO 13 | from .gail import GAIL 14 | from .airl import AIRL 15 | from .rs_mpc import RsMPC 16 | 17 | __all__ = [ 18 | "Agent", 19 | "LazyAgent", 20 | "VAC", 21 | "DDPG", 22 | "SAC", 23 | "TD3", 24 | "NoisyTD3", 25 | "BCQ", 26 | "BEAR", 27 | "BRAC", 28 | "BC", 29 | "VaeBC", 30 | "PPO", 31 | "GAIL", 32 | "AIRL", 33 | "RsMPC" 34 | ] 35 | -------------------------------------------------------------------------------- /rlil/agents/airl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.initializer import get_device, get_writer, get_replay_buffer 3 | from rlil import nn 4 | from .gail import GAIL 5 | 6 | 7 | class AIRL(GAIL): 8 | """ 9 | Adversarial inverse reinforcement learning (AIRL) 10 | 11 | AIRL is an inverse reinforcement learning algorithm based on 12 | adversarial learning. AIRL trains not only the reward function 13 | but also the value function to make the reward function robust 14 | to changes in dynamics. 15 | 16 | Args: 17 | base_agent (rlil.agent.Agent): 18 | An off-policy agent such as ddpg, td3, sac 19 | minibatch_size (int): 20 | The number of experiences to sample in each discriminator update. 21 | replay_start_size (int): Number of experiences in replay buffer when training begins. 
22 | update_frequency (int): Number of base_agent update per discriminator update 23 | """ 24 | 25 | def __init__(self, 26 | base_agent, 27 | minibatch_size=32, 28 | replay_start_size=5000, 29 | update_frequency=10, 30 | ): 31 | # objects 32 | self.base_agent = base_agent 33 | self.replay_buffer = get_replay_buffer() 34 | self.reward_fn = self.replay_buffer.reward_fn 35 | self.value_fn = self.replay_buffer.value_fn 36 | self.writer = get_writer() 37 | self.device = get_device() 38 | self.discrim_criterion = nn.BCELoss() 39 | # hyperparameters 40 | self.minibatch_size = minibatch_size 41 | self.replay_start_size = replay_start_size 42 | self.update_frequency = update_frequency 43 | self._train_count = 0 44 | 45 | def train(self): 46 | # train discriminator 47 | if self.should_train(): 48 | samples, expert_samples = self.replay_buffer.sample_both( 49 | self.minibatch_size) 50 | states, actions, _, next_states, _, _ = samples 51 | exp_states, exp_actions, _, exp_next_states, _, _ = expert_samples 52 | 53 | fake = self.replay_buffer.discrim(states, actions, next_states) 54 | real = self.replay_buffer.discrim(exp_states, 55 | exp_actions, 56 | exp_next_states) 57 | discrim_loss = self.discrim_criterion(fake, torch.ones_like(fake)) + \ 58 | self.discrim_criterion(real, torch.zeros_like(real)) 59 | 60 | self.reward_fn.zero_grad() 61 | self.value_fn.zero_grad() 62 | discrim_loss.backward() 63 | self.reward_fn.reinforce() 64 | self.value_fn.reinforce() 65 | 66 | # additional debugging info 67 | self.writer.add_scalar('airl/fake', fake.mean()) 68 | self.writer.add_scalar('airl/real', real.mean()) 69 | 70 | # train base_agent 71 | self.base_agent.train() 72 | -------------------------------------------------------------------------------- /rlil/agents/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from rlil.memory import ExperienceReplayBuffer 3 | from rlil.initializer import get_n_step 4 | from rlil.utils import Samples 5 | 6 | 7 | class Agent(ABC): 8 | """ 9 | Abstract agent class 10 | """ 11 | 12 | @abstractmethod 13 | def act(self, state, reward=None): 14 | """ 15 | Select an action for evaluation. 16 | If the agent has a replay-buffer, state and reward are stored. 17 | 18 | Args: 19 | state (rlil.environment.State): The environment state at the current timestep. 20 | reward (torch.Tensor): The reward from the previous timestep. 21 | 22 | Returns: 23 | rllib.Action: The action to take at the current timestep. 24 | """ 25 | 26 | @abstractmethod 27 | def make_lazy_agent(self, evaluation=False): 28 | """ 29 | Return a LazyAgent object for sampling or evaluation. 30 | 31 | Args: 32 | evaluation (bool, optional): If evaluation==True, the returned 33 | object act greedily. Defaults to False. 34 | 35 | Returns: 36 | LazyAgent: The LazyAgent object for Sampler. 37 | """ 38 | pass 39 | 40 | def train(self): 41 | """ 42 | Update internal parameters 43 | """ 44 | pass 45 | 46 | def load(self, dirname): 47 | """ 48 | Load pretrained agent. 49 | 50 | Args: 51 | dirname (str): Directory where the agent saved 52 | """ 53 | pass 54 | 55 | 56 | class LazyAgent(ABC): 57 | """ 58 | Agent class for Sampler. 
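    A LazyAgent is a lightweight stand-in for an Agent used by sampler workers:
    set_replay_buffer() attaches a local ExperienceReplayBuffer, and act()
    stores the (last_state, last_action, reward, state) sample there before
    computing the next action. Samplers obtain LazyAgents via
    Agent.make_lazy_agent().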
59 | """ 60 | 61 | def __init__(self, 62 | evaluation=False, 63 | store_samples=True): 64 | self._states = None 65 | self._actions = None 66 | self._evaluation = evaluation 67 | self._store_samples = store_samples 68 | self.replay_buffer = None 69 | # for N step replay buffer 70 | self._n_step, self._discount_factor = get_n_step() 71 | if self._evaluation: 72 | self._n_step = 1 # disable Nstep buffer when evaluation mode 73 | 74 | def set_replay_buffer(self, env): 75 | self.replay_buffer = ExperienceReplayBuffer( 76 | 1e7, env, n_step=self._n_step, 77 | discount_factor=self._discount_factor) 78 | 79 | def act(self, states, reward): 80 | """ 81 | In the act function, the lazy_agent put a sample 82 | (last_state, last_action, reward, states) into self.replay_buffer. 83 | Then, it outputs a corresponding action. 84 | """ 85 | if self._store_samples: 86 | assert self.replay_buffer is not None, \ 87 | "Call self.set_replay_buffer(env) at lazy_agent initialization." 88 | samples = Samples(self._states, self._actions, reward, states) 89 | self.replay_buffer.store(samples) 90 | 91 | def compute_priorities(self, samples): 92 | """ 93 | Compute priorities of the given samples. 94 | This method is useful for Apex implementation. 95 | Args: 96 | samples (rlil.utils.Samples) 97 | """ 98 | return None 99 | -------------------------------------------------------------------------------- /rlil/agents/bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.distributions.normal import Normal 3 | from torch.nn.functional import mse_loss 4 | from rlil.environments import State, action_decorator, Action 5 | from rlil.initializer import get_device, get_writer, get_replay_buffer 6 | from rlil import nn 7 | from copy import deepcopy 8 | from .base import Agent, LazyAgent 9 | import os 10 | 11 | 12 | class BC(Agent): 13 | """ 14 | Behavioral Cloning (BC) 15 | 16 | In behavioral cloning, the agent trains a classifier or regressor to 17 | replicate the expert's policy using the training data 18 | both the encountered states and actions. 19 | 20 | Args: 21 | policy (DeterministicPolicy): 22 | An Approximation of a deterministic policy. 23 | minibatch_size (int): 24 | The number of experiences to sample in each training update. 
25 | """ 26 | 27 | def __init__(self, 28 | policy, 29 | minibatch_size=32, 30 | ): 31 | # objects 32 | self.policy = policy 33 | self.replay_buffer = get_replay_buffer() 34 | self.writer = get_writer() 35 | self.device = get_device() 36 | # hyperparameters 37 | self.minibatch_size = minibatch_size 38 | 39 | def act(self, states, reward): 40 | self._states = states 41 | self._actions = Action(self.policy.eval(states.to(self.device))) 42 | return self._actions 43 | 44 | def train(self): 45 | if self.should_train(): 46 | (states, actions, _, _, _, _) = self.replay_buffer.sample( 47 | self.minibatch_size) 48 | policy_actions = Action(self.policy(states)) 49 | loss = mse_loss(policy_actions.features, actions.features) 50 | self.policy.reinforce(loss) 51 | self.writer.train_steps += 1 52 | 53 | def should_train(self): 54 | return True 55 | 56 | def make_lazy_agent(self, *args, **kwargs): 57 | model = deepcopy(self.policy.model) 58 | return BCLazyAgent(model.to("cpu"), *args, **kwargs) 59 | 60 | def load(self, dirname): 61 | for filename in os.listdir(dirname): 62 | if filename == 'policy.pt': 63 | self.policy.model = torch.load(os.path.join( 64 | dirname, filename), map_location=self.device) 65 | 66 | 67 | class BCLazyAgent(LazyAgent): 68 | """ 69 | Agent class for sampler. 70 | """ 71 | 72 | def __init__(self, policy_model, *args, **kwargs): 73 | self._policy_model = policy_model 74 | super().__init__(*args, **kwargs) 75 | 76 | def act(self, states, reward): 77 | super().act(states, reward) 78 | self._states = states 79 | with torch.no_grad(): 80 | self._actions = Action(self._policy_model(states)) 81 | return self._actions 82 | -------------------------------------------------------------------------------- /rlil/agents/gail.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.initializer import get_device, get_writer, get_replay_buffer 3 | from rlil import nn 4 | from .base import Agent 5 | 6 | 7 | class GAIL(Agent): 8 | """ 9 | Generative adversarial imitation learning (GAIL) 10 | 11 | GAIL is composed of two neural networks, the policy (generator) network 12 | and the discriminator network. In the original paper (https://arxiv.org/abs/1606.03476), 13 | the policy network is trained using TRPO. 14 | 15 | Args: 16 | base_agent (rlil.agent.Agent): Agent to train the policy. 17 | minibatch_size (int): 18 | The number of experiences to sample in each discriminator update. 19 | replay_start_size (int): Number of experiences in replay buffer when training begins. 
20 | update_frequency (int): Number of base_agent update per discriminator update 21 | """ 22 | 23 | def __init__(self, 24 | base_agent, 25 | minibatch_size=32, 26 | replay_start_size=5000, 27 | update_frequency=10, 28 | ): 29 | # objects 30 | self.base_agent = base_agent 31 | self.replay_buffer = get_replay_buffer() 32 | self.discriminator = self.replay_buffer.discriminator 33 | self.writer = get_writer() 34 | self.device = get_device() 35 | self.discrim_criterion = nn.BCELoss() 36 | # hyperparameters 37 | self.minibatch_size = minibatch_size 38 | self.replay_start_size = replay_start_size 39 | self.update_frequency = update_frequency 40 | self._train_count = 0 41 | 42 | def act(self, *args, **kwargs): 43 | return self.base_agent.act(*args, **kwargs) 44 | 45 | def train(self): 46 | self._train_count += 1 47 | # train discriminator 48 | if self.should_train(): 49 | samples, expert_samples = self.replay_buffer.sample_both( 50 | self.minibatch_size) 51 | states, actions, _, _, _, _ = samples 52 | exp_states, exp_actions, _, _, _, _ = expert_samples 53 | 54 | fake = self.discriminator( 55 | torch.cat((states.features, actions.features), dim=1)) 56 | real = self.discriminator( 57 | torch.cat((exp_states.features, exp_actions.features), dim=1)) 58 | discrim_loss = self.discrim_criterion(fake, torch.ones_like(fake)) + \ 59 | self.discrim_criterion(real, torch.zeros_like(real)) 60 | self.discriminator.reinforce(discrim_loss) 61 | 62 | # additional debugging info 63 | self.writer.add_scalar('gail/fake', fake.mean()) 64 | self.writer.add_scalar('gail/real', real.mean()) 65 | 66 | # train base_agent 67 | self.base_agent.train() 68 | 69 | def should_train(self): 70 | return len(self.replay_buffer) > self.replay_start_size and \ 71 | self._train_count % self.update_frequency == 0 72 | 73 | def make_lazy_agent(self, *args, **kwargs): 74 | return self.base_agent.make_lazy_agent(*args, **kwargs) 75 | 76 | def load(self, dirname): 77 | self.base_agent.load(dirname) 78 | -------------------------------------------------------------------------------- /rlil/agents/noisy_td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from copy import deepcopy 4 | from torch.distributions.normal import Normal 5 | from rlil.environments import Action 6 | from rlil.initializer import get_device, get_writer, get_replay_buffer 7 | from rlil.memory import ExperienceReplayBuffer 8 | from rlil import nn 9 | from rlil.utils import Samples 10 | from .td3 import TD3, LazyAgent 11 | 12 | 13 | class NoisyTD3(TD3): 14 | """ 15 | Twin Dueling DDPG (TD3) with noisy network. 16 | TD3: https://arxiv.org/abs/1802.09477 17 | Noisy Network: https://arxiv.org/abs/1706.10295 18 | 19 | Args: 20 | q_1 (QContinuous): An Approximation of the continuous action Q-function. 21 | q_2 (QContinuous): An Approximation of the continuous action Q-function. 22 | policy (DeterministicPolicy): An Approximation of a deterministic policy. 23 | discount_factor (float): Discount factor for future rewards. 24 | minibatch_size (int): The number of experiences to sample in each training update. 25 | noise_td3 (float): the amount of noise to add to each action in trick three. 26 | policy_update_td3 (int): Number of timesteps per training update the policy in trick two. 27 | replay_start_size (int): Number of experiences in replay buffer when training begins. 
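    Note:
        Exploration noise comes from noisy layers inside the policy network:
        make_lazy_agent() applies nn.perturb_noisy_layers to a CPU copy of the
        policy before handing it to a sampler worker, so no Gaussian noise is
        added to the actions executed in the environment.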
28 | """ 29 | 30 | def __init__(self, 31 | q_1, 32 | q_2, 33 | policy, 34 | discount_factor=0.99, 35 | minibatch_size=32, 36 | noise_td3=0.2, 37 | policy_update_td3=2, 38 | replay_start_size=5000, 39 | ): 40 | # objects 41 | self.q_1 = q_1 42 | self.q_2 = q_2 43 | self.policy = policy 44 | self.replay_buffer = get_replay_buffer() 45 | self.device = get_device() 46 | self.writer = get_writer() 47 | # hyperparameters 48 | self.replay_start_size = replay_start_size 49 | self.minibatch_size = minibatch_size 50 | self.discount_factor = discount_factor 51 | self._noise_td3 = Normal( 52 | 0, noise_td3*torch.tensor( 53 | (Action.action_space().high - Action.action_space().low) / 2, 54 | dtype=torch.float32, device=self.device)) 55 | 56 | self._policy_update_td3 = policy_update_td3 57 | self._states = None 58 | self._actions = None 59 | self._train_count = 0 60 | 61 | def act(self, states, reward=None): 62 | if reward is not None: 63 | samples = Samples(self._states, self._actions, reward, states) 64 | self.replay_buffer.store(samples) 65 | self._states = states 66 | actions = self.policy.no_grad(states.to(self.device)) 67 | self._actions = Action(actions).to("cpu") 68 | return self._actions 69 | 70 | def make_lazy_agent(self, 71 | evaluation=False, 72 | store_samples=True): 73 | model = deepcopy(self.policy.model) 74 | model.apply(nn.perturb_noisy_layers) 75 | return NoisyTD3LazyAgent(model.to("cpu"), 76 | evaluation=evaluation, 77 | store_samples=store_samples) 78 | 79 | 80 | class NoisyTD3LazyAgent(LazyAgent): 81 | """ 82 | Agent class for sampler. 83 | """ 84 | 85 | def __init__(self, 86 | policy_model, 87 | *args, 88 | **kwargs): 89 | self._policy_model = policy_model 90 | super().__init__(*args, **kwargs) 91 | if self._evaluation: 92 | self._policy_model.eval() 93 | 94 | def act(self, states, reward): 95 | super().act(states, reward) 96 | self._states = states 97 | with torch.no_grad(): 98 | actions = self._policy_model(states) 99 | self._actions = Action(actions) 100 | return self._actions 101 | -------------------------------------------------------------------------------- /rlil/agents/vae_bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.distributions.normal import Normal 3 | from torch.nn.functional import mse_loss 4 | from rlil.environments import State, action_decorator, Action 5 | from rlil.initializer import get_device, get_writer, get_replay_buffer 6 | from rlil import nn 7 | from copy import deepcopy 8 | from .base import Agent, LazyAgent 9 | import os 10 | 11 | 12 | class VaeBC(Agent): 13 | """ 14 | VAE Behavioral Cloning (VAE-BC) 15 | 16 | VaeBC is a behavioral cloning method used in BCQ, BEAR and BRAC. 17 | It replaces the NN regressor in BC implementation with a VAE. 18 | This code is mainly for debugging. 19 | 20 | Args: 21 | encoder (BcqEncoder): An approximation of the encoder. 22 | decoder (BcqDecoder): An approximation of the decoder. 23 | minibatch_size (int): 24 | The number of experiences to sample in each training update. 
25 | """ 26 | 27 | def __init__(self, 28 | encoder, 29 | decoder, 30 | minibatch_size=100, 31 | ): 32 | # objects 33 | self.encoder = encoder 34 | self.decoder = decoder 35 | self.replay_buffer = get_replay_buffer() 36 | self.writer = get_writer() 37 | self.device = get_device() 38 | # hyperparameters 39 | self.minibatch_size = minibatch_size 40 | 41 | def act(self, states, reward): 42 | # batch x num_decode x d 43 | vae_actions, _ = \ 44 | self.decoder.decode_multiple(states.to(self.device), num_decode=10) 45 | # batch x d 46 | vae_actions = vae_actions.mean(1) 47 | return Action(vae_actions) 48 | 49 | def train(self): 50 | (states, actions, _, _, _, _) = self.replay_buffer.sample( 51 | self.minibatch_size) 52 | 53 | # train vae 54 | mean, log_var = self.encoder( 55 | states.to(self.device), actions.to(self.device)) 56 | z = mean + (0.5 * log_var).exp() * torch.randn_like(log_var) 57 | vae_actions = Action(self.decoder(states, z)) 58 | vae_mse = mse_loss(actions.features, vae_actions.features) 59 | vae_kl = nn.kl_loss_vae(mean, log_var) 60 | vae_loss = vae_mse + vae_kl 61 | self.decoder.reinforce(vae_loss) 62 | self.encoder.reinforce() 63 | self.writer.add_scalar('loss/vae/mse', vae_mse.detach()) 64 | self.writer.add_scalar('loss/vae/kl', vae_kl.detach()) 65 | self.writer.train_steps += 1 66 | 67 | def should_train(self): 68 | return True 69 | 70 | def make_lazy_agent(self, *args, **kwargs): 71 | decoder_model = deepcopy(self.decoder.model) 72 | return VaeBcLazyAgent(decoder_model.to("cpu"), *args, **kwargs) 73 | 74 | def load(self, dirname): 75 | for filename in os.listdir(dirname): 76 | if filename in ('encoder.pt'): 77 | self.encoder.model = torch.load(os.path.join(dirname, filename), 78 | map_location=self.device) 79 | if filename in ('decoder.pt'): 80 | self.decoder.model = torch.load(os.path.join(dirname, filename), 81 | map_location=self.device) 82 | 83 | 84 | class VaeBcLazyAgent(LazyAgent): 85 | """ 86 | Agent class for sampler. 
87 | """ 88 | 89 | def __init__(self, decoder_model, *args, **kwargs): 90 | self._decoder_model = decoder_model 91 | super().__init__(*args, **kwargs) 92 | 93 | def act(self, states, reward): 94 | super().act(states, reward) 95 | self._states = states 96 | with torch.no_grad(): 97 | # batch x num_decode x d 98 | actions, _ = \ 99 | self._decoder_model.decode_multiple(states, num_decode=10) 100 | # batch x d 101 | self._actions = Action(actions.mean(1)) 102 | return self._actions 103 | -------------------------------------------------------------------------------- /rlil/approximation/__init__.py: -------------------------------------------------------------------------------- 1 | from .approximation import Approximation 2 | from .ensemble_q_continuous import EnsembleQContinuous 3 | from .q_continuous import QContinuous 4 | from .q_network import QNetwork 5 | from .v_network import VNetwork 6 | from .bcq_auto_encoder import BcqEncoder, BcqDecoder 7 | from .discriminator import Discriminator 8 | from .target import TargetNetwork, FixedTarget, PolyakTarget, TrivialTarget 9 | from .checkpointer import Checkpointer, DummyCheckpointer, PeriodicCheckpointer 10 | from .feature_network import FeatureNetwork 11 | from .dynamics import Dynamics 12 | 13 | 14 | __all__ = [ 15 | "Approximation", 16 | "EnsembleQContinuous", 17 | "QContinuous", 18 | "QNetwork", 19 | "VNetwork", 20 | "BcqEncoder", 21 | "BcqDecoder", 22 | "Discriminator", 23 | "TargetNetwork", 24 | "FixedTarget", 25 | "PolyakTarget", 26 | "TrivialTarget", 27 | "Checkpointer", 28 | "DummyCheckpointer", 29 | "PeriodicCheckpointer", 30 | "FeatureNetwork", 31 | "Dynamics" 32 | ] 33 | -------------------------------------------------------------------------------- /rlil/approximation/checkpointer/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from abc import abstractmethod, ABC 3 | import torch 4 | import os 5 | from rlil.initializer import get_writer 6 | 7 | 8 | class Checkpointer(ABC): 9 | @abstractmethod 10 | def init(self, model, filename): 11 | pass 12 | 13 | @abstractmethod 14 | def __call__(self): 15 | pass 16 | 17 | 18 | class DummyCheckpointer(Checkpointer): 19 | def init(self, *inputs): 20 | pass 21 | 22 | def __call__(self): 23 | pass 24 | 25 | 26 | class PeriodicCheckpointer(Checkpointer): 27 | def __init__(self, frequency): 28 | self.frequency = frequency 29 | self._writer = get_writer() 30 | self._log_dir = None 31 | self._filename = None 32 | self._model = None 33 | 34 | def init(self, model, log_dir, filename): 35 | self._model = model 36 | self._log_dir = log_dir 37 | self._filename = filename 38 | # Some builds of pytorch throw this unhelpful warning. 39 | # We can safely disable it. 
40 | # https://discuss.pytorch.org/t/got-warning-couldnt-retrieve-source-code-for-container/7689/7 41 | warnings.filterwarnings( 42 | "ignore", message="Couldn't retrieve source code") 43 | 44 | def __call__(self): 45 | # save pereodically 46 | # if self._writer.train_steps % self.frequency == 0: 47 | # save_dir = os.path.join(self._log_dir, str(self._writer.train_steps)) 48 | # if not os.path.exists(save_dir): 49 | # os.makedirs(save_dir) 50 | # torch.save(self._model, os.path.join( 51 | # save_dir, self._filename) + ".pt") 52 | 53 | if self._writer.train_steps % self.frequency == 0: 54 | torch.save(self._model, 55 | os.path.join(self._log_dir, self._filename + ".pt")) 56 | -------------------------------------------------------------------------------- /rlil/approximation/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.nn import RLNetwork 3 | from .approximation import Approximation 4 | 5 | 6 | class Discriminator(Approximation): 7 | def __init__( 8 | self, 9 | model, 10 | optimizer, 11 | name='discriminator', 12 | **kwargs 13 | ): 14 | model = DiscriminatorModule(model) 15 | super().__init__( 16 | model, 17 | optimizer, 18 | name=name, 19 | **kwargs 20 | ) 21 | 22 | def expert_reward(self, features): 23 | rew = torch.log(self.model(features)) - \ 24 | torch.log(1 - self.model(features)) 25 | return rew.squeeze().detach() 26 | 27 | 28 | class DiscriminatorModule(RLNetwork): 29 | def forward(self, features): 30 | return self.model(features) 31 | -------------------------------------------------------------------------------- /rlil/approximation/dynamics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.environments import State 3 | from rlil.nn import RLNetwork 4 | from .approximation import Approximation 5 | 6 | 7 | class Dynamics(Approximation): 8 | def __init__( 9 | self, 10 | model, 11 | optimizer, 12 | name='dynamics', 13 | **kwargs 14 | ): 15 | model = DynamicsModule(model) 16 | super().__init__( 17 | model, 18 | optimizer, 19 | name=name, 20 | **kwargs 21 | ) 22 | 23 | 24 | class DynamicsModule(RLNetwork): 25 | def forward(self, states, actions): 26 | x = torch.cat((states.features.float(), 27 | actions.features.float()), dim=1) 28 | diff_features = self.model(x) 29 | next_features = states.features + diff_features 30 | 31 | return State( 32 | next_features, 33 | mask=states.mask, 34 | info=states.info 35 | ) 36 | -------------------------------------------------------------------------------- /rlil/approximation/ensemble_q_continuous.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.nn import RLNetwork 3 | from .approximation import Approximation 4 | 5 | 6 | class EnsembleQContinuous(Approximation): 7 | def __init__( 8 | self, 9 | models: torch.nn.ModuleList, 10 | optimizer, 11 | name='ensemble_q', 12 | **kwargs 13 | ): 14 | model = EnsembleQContinuousModule(models) 15 | super().__init__( 16 | model, 17 | optimizer, 18 | name=name, 19 | **kwargs 20 | ) 21 | 22 | def q1(self, *args, **kwargs): 23 | return self.model.q1(*args, **kwargs) 24 | 25 | 26 | class EnsembleQContinuousModule(RLNetwork): 27 | def forward(self, states, actions): 28 | all_qs = [] 29 | x = torch.cat((states.features.float(), 30 | actions.features.float()), dim=1) 31 | for m in self.model: 32 | all_qs.append((m(x).squeeze(-1) 33 | * states.mask.float()).unsqueeze(1)) 34 | all_qs = torch.cat(all_qs, 
dim=1) 35 | return all_qs # batch x num_q 36 | 37 | def q1(self, states, actions): 38 | x = torch.cat((states.features.float(), 39 | actions.features.float()), dim=1) 40 | return self.model[0](x).squeeze(-1) * states.mask.float() 41 | -------------------------------------------------------------------------------- /rlil/approximation/feature_network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.environments import State 3 | from .approximation import Approximation 4 | 5 | 6 | class FeatureNetwork(Approximation): 7 | ''' 8 | A special type of Approximation that accumulates gradients before backpropagating them. 9 | This is useful when features are shared between network heads. 10 | 11 | The __call__ function caches the computation graph and detaches the output. 12 | Then, various functions approximators may backpropagate to the output. 13 | The reinforce() function will then backpropagate the accumulated gradients on the output 14 | through the original computation graph. 15 | ''' 16 | 17 | def __init__(self, model, optimizer=None, name='feature', **kwargs): 18 | model = FeatureModule(model) 19 | super().__init__(model, optimizer, name=name, **kwargs) 20 | self._cache = [] 21 | self._out = [] 22 | 23 | def __call__(self, states): 24 | ''' 25 | Run a forward pass of the model and return the detached output. 26 | 27 | Args: 28 | state (all.environment.State): An environment State 29 | 30 | Returns: 31 | all.environment.State: An enviornment State with the computed features 32 | ''' 33 | features = self.model(states) 34 | graphs = features.raw 35 | features._raw = graphs.detach() 36 | features._raw.requires_grad = True 37 | self._enqueue(graphs, features._raw) 38 | return features 39 | 40 | def reinforce(self): 41 | ''' 42 | Backward pass of the model. 
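        Backpropagates the gradients that downstream heads accumulated on the
        detached outputs cached by __call__, then steps the optimizer and
        clears the cache, so call it after those heads have run their own
        backward passes.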
43 | ''' 44 | self._optimizer.zero_grad() 45 | graphs, grads = self._dequeue() 46 | graphs.backward(grads) 47 | self.step() 48 | 49 | def _enqueue(self, features, out): 50 | self._cache.append(features) 51 | self._out.append(out) 52 | 53 | def _dequeue(self): 54 | graphs = [] 55 | grads = [] 56 | for graph, out in zip(self._cache, self._out): 57 | if out.grad is not None: 58 | graphs.append(graph) 59 | grads.append(out.grad) 60 | self._cache = [] 61 | self._out = [] 62 | return torch.cat(graphs), torch.cat(grads) 63 | 64 | 65 | class FeatureModule(torch.nn.Module): 66 | def __init__(self, model): 67 | super().__init__() 68 | self.model = model 69 | 70 | def forward(self, states): 71 | features = self.model(states.features.float()) 72 | return State( 73 | features, 74 | mask=states.mask, 75 | info=states.info 76 | ) 77 | -------------------------------------------------------------------------------- /rlil/approximation/q_continuous.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.nn import RLNetwork 3 | from .approximation import Approximation 4 | 5 | 6 | class QContinuous(Approximation): 7 | def __init__( 8 | self, 9 | model, 10 | optimizer, 11 | name='q', 12 | **kwargs 13 | ): 14 | model = QContinuousModule(model) 15 | super().__init__( 16 | model, 17 | optimizer, 18 | name=name, 19 | **kwargs 20 | ) 21 | 22 | 23 | class QContinuousModule(RLNetwork): 24 | def forward(self, states, actions): 25 | x = torch.cat((states.features.float(), 26 | actions.features.float()), dim=1) 27 | return self.model(x).squeeze(-1) * states.mask.float() 28 | -------------------------------------------------------------------------------- /rlil/approximation/q_network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.nn import RLNetwork 3 | from .approximation import Approximation 4 | 5 | 6 | class QNetwork(Approximation): 7 | def __init__( 8 | self, 9 | model, 10 | optimizer, 11 | name='q', 12 | **kwargs 13 | ): 14 | model = QModule(model) 15 | super().__init__( 16 | model, 17 | optimizer, 18 | name=name, 19 | **kwargs 20 | ) 21 | 22 | 23 | class QModule(RLNetwork): 24 | def forward(self, states, actions=None): 25 | values = super().forward(states) 26 | if actions is None: 27 | return values 28 | return values.gather(1, actions.features.view(-1, 1)).squeeze(1) 29 | -------------------------------------------------------------------------------- /rlil/approximation/target/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract import TargetNetwork 2 | from .fixed import FixedTarget 3 | from .polyak import PolyakTarget 4 | from .trivial import TrivialTarget 5 | -------------------------------------------------------------------------------- /rlil/approximation/target/abstract.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod, ABC 2 | 3 | 4 | class TargetNetwork(ABC): 5 | @abstractmethod 6 | def __call__(self, *inputs): 7 | pass 8 | 9 | @abstractmethod 10 | def init(self, model): 11 | pass 12 | 13 | @abstractmethod 14 | def update(self): 15 | pass 16 | -------------------------------------------------------------------------------- /rlil/approximation/target/fixed.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | from .abstract import TargetNetwork 4 | 5 | 6 | class 
FixedTarget(TargetNetwork): 7 | def __init__(self, update_frequency): 8 | self._source = None 9 | self._target = None 10 | self._updates = 0 11 | self._update_frequency = update_frequency 12 | 13 | def __call__(self, *inputs): 14 | with torch.no_grad(): 15 | return self._target(*inputs) 16 | 17 | def init(self, model): 18 | self._source = model 19 | self._target = copy.deepcopy(model) 20 | 21 | def update(self): 22 | self._updates += 1 23 | if self._should_update(): 24 | self._target.load_state_dict(self._source.state_dict()) 25 | 26 | def _should_update(self): 27 | return self._updates % self._update_frequency == 0 28 | -------------------------------------------------------------------------------- /rlil/approximation/target/polyak.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | from .abstract import TargetNetwork 4 | 5 | 6 | class PolyakTarget(TargetNetwork): 7 | '''TargetNetwork that updates using polyak averaging''' 8 | 9 | def __init__(self, rate): 10 | self._source = None 11 | self._target = None 12 | self._rate = rate 13 | 14 | def __call__(self, *inputs): 15 | with torch.no_grad(): 16 | return self._target(*inputs) 17 | 18 | def init(self, model): 19 | self._source = model 20 | self._target = copy.deepcopy(model) 21 | 22 | def update(self): 23 | for target_param, source_param in zip(self._target.parameters(), self._source.parameters()): 24 | target_param.data.copy_( 25 | target_param.data * (1.0 - self._rate) + 26 | source_param.data * self._rate 27 | ) 28 | -------------------------------------------------------------------------------- /rlil/approximation/target/trivial.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .abstract import TargetNetwork 3 | 4 | 5 | class TrivialTarget(TargetNetwork): 6 | def __init__(self): 7 | self._target = None 8 | 9 | def __call__(self, *inputs): 10 | with torch.no_grad(): 11 | return self._target(*inputs) 12 | 13 | def init(self, model): 14 | self._target = model 15 | 16 | def update(self): 17 | pass 18 | -------------------------------------------------------------------------------- /rlil/approximation/v_network.py: -------------------------------------------------------------------------------- 1 | from rlil.nn import RLNetwork 2 | from .approximation import Approximation 3 | 4 | 5 | class VNetwork(Approximation): 6 | def __init__( 7 | self, 8 | model, 9 | optimizer, 10 | name='v', 11 | **kwargs 12 | ): 13 | model = VModule(model) 14 | super().__init__( 15 | model, 16 | optimizer, 17 | name=name, 18 | **kwargs 19 | ) 20 | 21 | 22 | class VModule(RLNetwork): 23 | def forward(self, states): 24 | return super().forward(states).squeeze(-1) 25 | -------------------------------------------------------------------------------- /rlil/environments/reward_fns.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import torch 4 | import numpy as np 5 | 6 | 7 | class PendulumReward: 8 | def __call__(self, states, next_states, actions): 9 | # reward function of Pendulum-v0 10 | thetas = torch.atan2(states.features[:, 1], states.features[:, 0]) 11 | theta_dots = states.features[:, 2] 12 | 13 | def angle_normalize(x): 14 | return (((x+np.pi) % (2*np.pi)) - np.pi) 15 | 16 | costs = angle_normalize(thetas) ** 2 \ 17 | + .1 * theta_dots ** 2 \ 18 | + .001*(actions.features.squeeze()**2) 19 | return -costs 20 | 21 | 22 | class MountainCarContinuousReward: 23 | 
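    """Reward model of MountainCarContinuous-v0 (as implemented below): +100
    while the car is at or beyond the goal position with non-negative velocity,
    minus an energy penalty of 0.1 * action^2."""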
def __init__(self): 24 | self.goal_position = 0.45 25 | self.goal_velocity = 0 26 | 27 | def __call__(self, states, next_states, actions): 28 | positions = states.features[:, 0] 29 | velocities = states.features[:, 1] 30 | goals = (positions >= self.goal_position) & ( 31 | velocities >= self.goal_velocity) 32 | 33 | rewards = torch.zeros(len(states), dtype=torch.float32) 34 | rewards += goals * 100.0 35 | rewards -= actions.features[:, 0] ** 2 * 0.1 36 | return rewards 37 | -------------------------------------------------------------------------------- /rlil/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | from .experiment import Experiment 2 | from .trainer import Trainer 3 | 4 | __all__ = [ 5 | "Experiment", 6 | "Trainer" 7 | ] 8 | -------------------------------------------------------------------------------- /rlil/experiments/experiment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rlil.utils.writer import ExperimentWriter 3 | from rlil.initializer import get_logger, get_writer, set_writer, set_logger, set_seed 4 | from rlil.samplers import AsyncSampler 5 | from .trainer import Trainer 6 | import os 7 | import logging 8 | import json 9 | import git 10 | import warnings 11 | 12 | 13 | class Experiment: 14 | def __init__( 15 | self, 16 | agent_fn, 17 | env, 18 | agent_name=None, 19 | args_dict={}, 20 | exp_info="default_experiments", 21 | seed=0, 22 | trains_per_episode=20, 23 | num_workers=1, 24 | num_workers_eval=1, 25 | max_sample_frames=np.inf, 26 | max_sample_episodes=np.inf, 27 | max_train_steps=np.inf, 28 | train_minutes=np.inf 29 | ): 30 | # set_seed 31 | set_seed(seed) 32 | 33 | # set writer 34 | if agent_name is None: 35 | agent_name = agent_fn.__name__[1:].replace("_", "-") 36 | writer = self._make_writer(agent_name, env.name, exp_info) 37 | message = "\n# Experiment: " + exp_info 38 | message += " \n# Parameters: \n" 39 | message += json.dumps(args_dict, indent=4, 40 | sort_keys=True).replace("\n", " \n") 41 | 42 | # write git diff 43 | try: 44 | repo = git.Repo('./') 45 | t = repo.head.commit.tree 46 | diff = repo.git.diff(t).replace("\n", " \n") 47 | message += " \n# Git diff: \n" + diff 48 | except git.InvalidGitRepositoryError: 49 | warnings.warn( 50 | "Current repository doesn't have .git. 
git diff is not recorded.") 51 | 52 | writer.add_text("exp_summary", message) 53 | set_writer(writer) 54 | 55 | # set logger 56 | logger = get_logger() 57 | handler = logging.FileHandler( 58 | os.path.join(writer.log_dir, "logger.log")) 59 | fmt = logging.Formatter('%(levelname)s : %(asctime)s : %(message)s') 60 | handler.setFormatter(fmt) 61 | logger.addHandler(handler) 62 | set_logger(logger) 63 | 64 | # save args 65 | with open(os.path.join(writer.log_dir, "args.json"), mode="w") as f: 66 | json.dump(args_dict, f) 67 | 68 | # start training 69 | agent = agent_fn(env) 70 | 71 | sampler = AsyncSampler(env, num_workers=num_workers) \ 72 | if num_workers > 0 else None 73 | eval_sampler = AsyncSampler(env, num_workers=num_workers_eval) \ 74 | if num_workers_eval > 0 else None 75 | 76 | trainer = Trainer( 77 | agent=agent, 78 | sampler=sampler, 79 | eval_sampler=eval_sampler, 80 | trains_per_episode=trains_per_episode, 81 | max_sample_frames=max_sample_frames, 82 | max_sample_episodes=max_sample_episodes, 83 | max_train_steps=max_train_steps, 84 | train_minutes=train_minutes 85 | ) 86 | 87 | trainer.start_training() 88 | 89 | def _make_writer(self, agent_name, env_name, exp_info): 90 | return ExperimentWriter(agent_name=agent_name, 91 | env_name=env_name, 92 | exp_info=exp_info) 93 | -------------------------------------------------------------------------------- /rlil/initializer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import logging 5 | from rlil.utils.writer import DummyWriter 6 | 7 | os.environ["PYTHONWARNINGS"] = 'ignore:semaphore_tracker:UserWarning' 8 | 9 | _DEBUG_MODE = False 10 | 11 | 12 | def enable_debug_mode(): 13 | global _DEBUG_MODE 14 | print("-----DEBUG_MODE: True-----") 15 | torch.autograd.set_detect_anomaly(True) 16 | _DEBUG_MODE = True 17 | 18 | 19 | def disable_debug_mode(): 20 | global _DEBUG_MODE 21 | print("-----DEBUG_MODE: False-----") 22 | _DEBUG_MODE = False 23 | 24 | 25 | def is_debug_mode(): 26 | global _DEBUG_MODE 27 | return _DEBUG_MODE 28 | 29 | 30 | _DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | 32 | 33 | def set_device(device): 34 | global _DEVICE 35 | _DEVICE = device 36 | 37 | 38 | def get_device(): 39 | return _DEVICE 40 | 41 | 42 | _SEED = 0 43 | 44 | 45 | def set_seed(seed): 46 | global _SEED 47 | np.random.seed(seed) 48 | torch.manual_seed(seed) 49 | if torch.cuda.is_available(): 50 | torch.cuda.manual_seed_all(seed) 51 | _SEED = seed 52 | print("-----SEED: {}-----".format(_SEED)) 53 | 54 | 55 | def call_seed(): 56 | global _SEED 57 | np.random.seed(_SEED) 58 | torch.manual_seed(_SEED) 59 | if torch.cuda.is_available(): 60 | torch.cuda.manual_seed_all(_SEED) 61 | return _SEED 62 | 63 | 64 | _WRITER = DummyWriter() 65 | 66 | 67 | def set_writer(writer): 68 | global _WRITER 69 | _WRITER = writer 70 | 71 | 72 | def get_writer(): 73 | return _WRITER 74 | 75 | 76 | _LOGGER = logging.getLogger(__name__) 77 | 78 | 79 | def set_logger(logger): 80 | global _LOGGER 81 | _LOGGER = logger 82 | 83 | 84 | def get_logger(): 85 | return _LOGGER 86 | 87 | 88 | _REPLAY_BUFFER = None 89 | 90 | 91 | def set_replay_buffer(replay_buffer): 92 | global _REPLAY_BUFFER 93 | _REPLAY_BUFFER = replay_buffer 94 | 95 | 96 | def get_replay_buffer(): 97 | global _REPLAY_BUFFER 98 | if _REPLAY_BUFFER is None: 99 | raise ValueError("replay_buffer is not set") 100 | return _REPLAY_BUFFER 101 | 102 | 103 | _ON_POLICY_MODE = False 104 | 105 | 106 | def 
enable_on_policy_mode(): 107 | global _ON_POLICY_MODE 108 | _ON_POLICY_MODE = True 109 | print("-----ON_POLICY_MODE: {}-----".format(_ON_POLICY_MODE)) 110 | 111 | 112 | def disable_on_policy_mode(): 113 | global _ON_POLICY_MODE 114 | _ON_POLICY_MODE = False 115 | print("-----ON_POLICY_MODE: {}-----".format(_ON_POLICY_MODE)) 116 | 117 | 118 | def is_on_policy_mode(): 119 | global _ON_POLICY_MODE 120 | return _ON_POLICY_MODE 121 | 122 | 123 | # parameters of NstepExperienceReplay 124 | _NSTEP = 1 125 | _DISCOUNT_FACTOR = 0.95 126 | 127 | 128 | def set_n_step(n_step, discount_factor=0.95): 129 | global _NSTEP, _DISCOUNT_FACTOR 130 | _NSTEP = n_step 131 | _DISCOUNT_FACTOR = discount_factor 132 | print("-----N step: {}-----".format(_NSTEP)) 133 | print("-----Discount factor: {}-----".format(_DISCOUNT_FACTOR)) 134 | 135 | 136 | def get_n_step(): 137 | global _NSTEP, _DISCOUNT_FACTOR 138 | return _NSTEP, _DISCOUNT_FACTOR 139 | 140 | 141 | _USE_APEX = False 142 | 143 | 144 | def enable_apex(): 145 | global _USE_APEX 146 | _USE_APEX = True 147 | print("-----USE_APEX: {}-----".format(_USE_APEX)) 148 | 149 | 150 | def disable_apex(): 151 | global _USE_APEX 152 | _USE_APEX = False 153 | print("-----USE_APEX: {}-----".format(_USE_APEX)) 154 | 155 | 156 | def use_apex(): 157 | global _USE_APEX 158 | return _USE_APEX 159 | -------------------------------------------------------------------------------- /rlil/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from .replay_buffer import ( 2 | BaseReplayBuffer, 3 | ExperienceReplayBuffer, 4 | ) 5 | from .gail_wrapper import GailWrapper 6 | from .gae_wrapper import GaeWrapper 7 | from .sqil_wrapper import SqilWrapper 8 | from .airl_wrapper import AirlWrapper 9 | from cpprb import ReplayBuffer 10 | 11 | 12 | __all__ = [ 13 | "ReplayBuffer", 14 | "BaseReplayBuffer", 15 | "ExperienceReplayBuffer", 16 | "GailWrapper", 17 | "GaeWrapper", 18 | "SqilWrapper", 19 | "AirlWrapper" 20 | ] 21 | -------------------------------------------------------------------------------- /rlil/memory/airl_wrapper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import numpy as np 3 | import torch 4 | from rlil.environments import State, Action 5 | from rlil.initializer import get_device, is_debug_mode 6 | from .gail_wrapper import GailWrapper 7 | 8 | 9 | class AirlWrapper(GailWrapper): 10 | """ 11 | A wrapper of ExperienceReplayBuffer for rlil.agents.AIRL. 12 | """ 13 | 14 | def __init__(self, 15 | buffer, 16 | expert_buffer, 17 | reward_fn, 18 | value_fn, 19 | policy, 20 | feature_nw=None, 21 | discount_factor=1.0): 22 | """ 23 | Args: 24 | buffer (rlil.memory.ExperienceReplayBuffer): 25 | A replay_buffer for sampling. 26 | expert_buffer (rlil.memory.ExperienceReplayBuffer): 27 | A replay_buffer with expert trajectories. 28 | reward_fn (rlil.approximation.Approximation): 29 | A reward function approximation. 30 | value_fn (rlil.approximation.Approximation): 31 | A value function approximation. 
32 | policy (rlil.policies): 33 | A policy approximation 34 | feature_nw (rlil.approximation.FeatureNetwork) 35 | """ 36 | self.buffer = buffer 37 | self.expert_buffer = expert_buffer 38 | self.device = get_device() 39 | self.reward_fn = reward_fn 40 | self.value_fn = value_fn 41 | self.policy = policy 42 | self.feature_nw = feature_nw 43 | self.discount_factor = discount_factor 44 | 45 | def sample(self, batch_size): 46 | # replace the rewards with gail rewards 47 | states, actions, rewards, next_states, weights, indexes = \ 48 | self.buffer.sample(batch_size) 49 | 50 | ds = self.discrim(states, actions, next_states) 51 | rewards = self.expert_reward(ds) 52 | return (states, actions, rewards, next_states, weights, indexes) 53 | 54 | def discrim(self, states, actions, next_states): 55 | if self.feature_nw is None: 56 | features = states 57 | else: 58 | features = self.feature_nw.no_grad(states) 59 | policy_prob = self.policy.no_grad(features).log_prob( 60 | actions.features).exp() 61 | 62 | f = self.reward_fn( 63 | torch.cat((states.features, actions.features), dim=1)).squeeze(1) \ 64 | + next_states.mask.float() \ 65 | * (self.discount_factor * self.value_fn(next_states) 66 | - self.value_fn(states)) 67 | f_exp = f.exp() 68 | d = f_exp / (f_exp + policy_prob) 69 | return d 70 | 71 | def expert_reward(self, d): 72 | return (torch.log(d) - torch.log(1 - d)).squeeze().detach() 73 | -------------------------------------------------------------------------------- /rlil/memory/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class BaseReplayBuffer(ABC): 5 | @abstractmethod 6 | def store(self, states, actions, rewards, next_states): 7 | """Store the transition in the buffer 8 | Args: 9 | states (rlil.environment.State): batch_size x shape 10 | actions (rlil.environment.Action): batch_size x shape 11 | rewards (torch.Tensor): batch_size 12 | next_states (rlil.environment.State): batch_size x shape 13 | """ 14 | 15 | @abstractmethod 16 | def sample(self, batch_size): 17 | '''Sample from the stored transitions''' 18 | 19 | @abstractmethod 20 | def update_priorities(self, indexes, td_errors): 21 | '''Update priorities based on the TD error''' 22 | 23 | @abstractmethod 24 | def get_all_transitions(self): 25 | '''Return all the samples''' 26 | 27 | @abstractmethod 28 | def clear(self): 29 | '''Clear replay buffer''' 30 | 31 | 32 | class BaseBufferWrapper(ABC): 33 | def __init__(self, buffer): 34 | self.buffer = buffer 35 | 36 | def store(self, *args, **kwargs): 37 | self.buffer.store(*args, **kwargs) 38 | 39 | def sample(self, *args, **kwargs): 40 | return self.buffer.sample(*args, **kwargs) 41 | 42 | def update_priorities(self, *args, **kwargs): 43 | self.buffer.update_priorities(*args, **kwargs) 44 | 45 | def clear(self): 46 | self.buffer.clear() 47 | 48 | def get_all_transitions(self): 49 | return self.buffer.get_all_transitions() 50 | 51 | def samples_from_cpprb(self, *args, **kwargs): 52 | return self.buffer.samples_from_cpprb(*args, **kwargs) 53 | 54 | def __len__(self): 55 | return len(self.buffer) 56 | -------------------------------------------------------------------------------- /rlil/memory/gae_wrapper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import numpy as np 3 | import torch 4 | from rlil.environments import State, Action 5 | from rlil.initializer import get_device, is_debug_mode 6 | from .replay_buffer 
import ExperienceReplayBuffer 7 | from .base import BaseBufferWrapper 8 | 9 | 10 | class GaeWrapper(BaseBufferWrapper): 11 | """ 12 | A wrapper of ExperienceReplayBuffer for Generalized Advantage Estimation. 13 | https://arxiv.org/abs/1506.02438 14 | """ 15 | 16 | def __init__(self, buffer, discount_factor=1, lam=1): 17 | """ 18 | Args: 19 | buffer (rlil.memory.ExperienceReplayBuffer): 20 | A replay_buffer for sampling. 21 | """ 22 | self.buffer = buffer 23 | self.device = get_device() 24 | self.discount_factor = discount_factor 25 | self.lam = lam 26 | 27 | def compute_gae(self, rewards, values, next_values, masks): 28 | td_errors = rewards + self.discount_factor * next_values - values 29 | 30 | # compute_gaes 31 | length = len(td_errors) 32 | gaes = torch.zeros(length, device=self.device) 33 | 34 | gae = 0.0 35 | for i in reversed(range(length)): 36 | mask = masks[i].float() 37 | gae = td_errors[i] + self.discount_factor * self.lam * gae * mask 38 | gaes[i] = gae 39 | 40 | # normalize Advantage 41 | # see: https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/issues/102 42 | gaes = (gaes - gaes.mean()) / gaes.std() 43 | return gaes -------------------------------------------------------------------------------- /rlil/memory/gail_wrapper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import numpy as np 3 | import torch 4 | from rlil.environments import State, Action 5 | from rlil.initializer import get_device, is_debug_mode 6 | from .replay_buffer import ExperienceReplayBuffer 7 | from .base import BaseBufferWrapper 8 | from .gae_wrapper import GaeWrapper 9 | 10 | 11 | class GailWrapper(BaseBufferWrapper): 12 | """ 13 | A wrapper of ExperienceReplayBuffer for rlil.agents.GAIL. 14 | """ 15 | 16 | def __init__(self, buffer, expert_buffer, discriminator): 17 | """ 18 | Args: 19 | buffer (rlil.memory.ExperienceReplayBuffer): 20 | A replay_buffer for sampling. 21 | expert_buffer (rlil.memory.ExperienceReplayBuffer): 22 | A replay_buffer with expert trajectories. 23 | discriminator (rlil.approximation.Discriminator): 24 | A discriminator approximation. 
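        Example (an illustrative sketch of the wiring used by
        rlil.presets.continuous.gail; env, transitions and
        discriminator are assumed to exist already):

            from rlil.initializer import get_replay_buffer
            from rlil.memory import ExperienceReplayBuffer, GailWrapper

            expert_buffer = ExperienceReplayBuffer(1e7, env)
            expert_buffer.store(
                expert_buffer.samples_from_cpprb(transitions, device="cpu"))
            buffer = GailWrapper(get_replay_buffer(), expert_buffer, discriminator)
            # half of each batch comes from the agent, half from the expert
            samples, expert_samples = buffer.sample_both(512)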
25 | """ 26 | self.buffer = buffer 27 | self.expert_buffer = expert_buffer 28 | self.device = get_device() 29 | self.discriminator = discriminator 30 | 31 | def sample(self, batch_size): 32 | # replace the rewards with gail rewards 33 | states, actions, rewards, next_states, weights, indexes = \ 34 | self.buffer.sample(batch_size) 35 | 36 | rewards = self.discriminator.expert_reward( 37 | torch.cat((states.features, actions.features), dim=1)) 38 | return (states, actions, rewards, next_states, weights, indexes) 39 | 40 | def sample_both(self, batch_size): 41 | batch_size = int(batch_size / 2) 42 | samples = self.buffer.sample(batch_size) 43 | expert_samples = self.expert_buffer.sample(batch_size) 44 | return samples, expert_samples 45 | 46 | def get_all_transitions(self): 47 | # return the sampled trajectories 48 | # not including expert trajectories 49 | return self.buffer.get_all_transitions() 50 | 51 | def compute_gae(self, *args, **kwargs): 52 | # wrap function for GaeWrapper 53 | if isinstance(self.buffer, GaeWrapper): 54 | return self.buffer.compute_gae(*args, **kwargs) 55 | 56 | def clear(self): 57 | self.buffer.clear() 58 | 59 | def __len__(self): 60 | # return the number of sampled trajectories 61 | # not including expert trajectories 62 | return len(self.buffer) 63 | -------------------------------------------------------------------------------- /rlil/memory/sqil_wrapper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import numpy as np 3 | import torch 4 | from rlil.environments import State, Action 5 | from rlil.initializer import get_device, is_debug_mode 6 | from .replay_buffer import ExperienceReplayBuffer 7 | from .base import BaseBufferWrapper 8 | from .gae_wrapper import GaeWrapper 9 | 10 | 11 | class SqilWrapper(BaseBufferWrapper): 12 | """ 13 | SQIL is a behavior cloning method which regularizes the 14 | reward to sparse by giving the agent a constant 15 | reward of r = +1 for matching the demonstrated action in 16 | a demonstrated state, and giving the agent a constant reward 17 | of r = 0 for all other behavior. 18 | https://arxiv.org/abs/1905.11108 19 | """ 20 | 21 | def __init__(self, buffer, expert_buffer): 22 | """ 23 | Args: 24 | buffer (rlil.memory.ExperienceReplayBuffer): 25 | A replay_buffer for sampling. 26 | expert_buffer (rlil.memory.ExperienceReplayBuffer): 27 | A replay_buffer with expert trajectories. 
28 | """ 29 | self.buffer = buffer 30 | self.expert_buffer = expert_buffer 31 | self.device = get_device() 32 | 33 | def sample(self, batch_size): 34 | batch_size = int(batch_size / 2) 35 | states, actions, rewards, next_states, weights, indexes = \ 36 | self.buffer.sample(batch_size) 37 | exp_states, exp_actions, exp_rewards, exp_next_states, \ 38 | exp_weights, exp_indexes = self.expert_buffer.sample(batch_size) 39 | 40 | rewards = torch.zeros_like(rewards, dtype=torch.float32, 41 | device=self.device) 42 | exp_rewards = torch.ones_like(exp_rewards, dtype=torch.float32, 43 | device=self.device) 44 | 45 | states = State.from_list([states, exp_states]) 46 | actions = Action.from_list([actions, exp_actions]) 47 | rewards = torch.cat([rewards, exp_rewards], axis=0) 48 | next_states = State.from_list([next_states, exp_next_states]) 49 | weights = torch.cat([weights, exp_weights], axis=0) 50 | 51 | # shuffle tensors 52 | index = torch.randperm(len(rewards)) 53 | if indexes is None or exp_indexes is None: 54 | indexes = None 55 | else: 56 | indexes = torch.cat([indexes, exp_indexes], axis=0)[index] 57 | 58 | return (states[index], 59 | actions[index], 60 | rewards[index], 61 | next_states[index], 62 | weights[index], 63 | indexes) 64 | -------------------------------------------------------------------------------- /rlil/policies/__init__.py: -------------------------------------------------------------------------------- 1 | from .gaussian import GaussianPolicy 2 | from .softmax import SoftmaxPolicy 3 | from .deterministic import DeterministicPolicy 4 | from .bcq_deterministic import BCQDeterministicPolicy 5 | from .soft_deterministic import SoftDeterministicPolicy 6 | 7 | __all__ = [ 8 | "GaussianPolicy", 9 | "SoftmaxPolicy", 10 | "DeterministicPolicy", 11 | "BCQDeterministicPolicy", 12 | "SoftDeterministicPolicy" 13 | ] 14 | -------------------------------------------------------------------------------- /rlil/policies/bcq_deterministic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.environments import squash_action 3 | from rlil.approximation import Approximation 4 | from rlil.nn import RLNetwork 5 | 6 | 7 | class BCQDeterministicPolicy(Approximation): 8 | def __init__( 9 | self, 10 | model, 11 | optimizer, 12 | space, 13 | phi=0.05, 14 | name='policy', 15 | **kwargs 16 | ): 17 | model = BCQDeterministicPolicyNetwork(model, space, phi) 18 | super().__init__( 19 | model, 20 | optimizer, 21 | name=name, 22 | **kwargs 23 | ) 24 | 25 | 26 | class BCQDeterministicPolicyNetwork(RLNetwork): 27 | def __init__(self, model, space, phi=0.05): 28 | super().__init__(model) 29 | self._tanh_scale = torch.tensor( 30 | (space.high - space.low) / 2, 31 | dtype=torch.float32, device=self.device) 32 | self._tanh_mean = torch.tensor( 33 | (space.high + space.low) / 2, 34 | dtype=torch.float32, device=self.device) 35 | self.phi = phi 36 | 37 | def forward(self, states, vae_actions): 38 | x = torch.cat((states.features.float(), 39 | vae_actions.features.float()), dim=1) 40 | actions = self.model(x) * states.mask.float().unsqueeze(-1) 41 | actions = self.phi * \ 42 | squash_action(actions, self._tanh_scale, self._tanh_mean) 43 | return vae_actions.features + actions 44 | 45 | def to(self, device): 46 | self._tanh_mean = self._tanh_mean.to(device) 47 | self._tanh_scale = self._tanh_scale.to(device) 48 | return super().to(device) 49 | -------------------------------------------------------------------------------- /rlil/policies/deterministic.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.environments import squash_action 3 | from rlil.approximation import Approximation 4 | from rlil.nn import RLNetwork 5 | 6 | 7 | class DeterministicPolicy(Approximation): 8 | def __init__( 9 | self, 10 | model, 11 | optimizer, 12 | space, 13 | name='policy', 14 | **kwargs 15 | ): 16 | model = DeterministicPolicyNetwork(model, space) 17 | super().__init__( 18 | model, 19 | optimizer, 20 | name=name, 21 | **kwargs 22 | ) 23 | 24 | 25 | class DeterministicPolicyNetwork(RLNetwork): 26 | def __init__(self, model, space): 27 | super().__init__(model) 28 | self._tanh_scale = torch.tensor( 29 | (space.high - space.low) / 2).to(self.device) 30 | self._tanh_mean = torch.tensor( 31 | (space.high + space.low) / 2).to(self.device) 32 | 33 | def forward(self, state): 34 | return squash_action(super().forward(state), 35 | self._tanh_scale, self._tanh_mean) 36 | 37 | def to(self, device): 38 | self._tanh_mean = self._tanh_mean.to(device) 39 | self._tanh_scale = self._tanh_scale.to(device) 40 | return super().to(device) 41 | -------------------------------------------------------------------------------- /rlil/policies/gaussian.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.distributions.independent import Independent 3 | from torch.distributions.normal import Normal 4 | from rlil.environments import squash_action 5 | from rlil.approximation import Approximation 6 | from rlil.nn import RLNetwork 7 | 8 | 9 | class GaussianPolicy(Approximation): 10 | def __init__( 11 | self, 12 | model, 13 | optimizer, 14 | space, 15 | name='policy', 16 | **kwargs 17 | ): 18 | super().__init__( 19 | GaussianPolicyNetwork(model, space), 20 | optimizer, 21 | name=name, 22 | **kwargs 23 | ) 24 | 25 | 26 | class GaussianPolicyNetwork(RLNetwork): 27 | def __init__(self, model, space): 28 | super().__init__(model) 29 | self._action_dim = space.shape[0] 30 | 31 | def forward(self, state, return_mean=False): 32 | outputs = super().forward(state) 33 | means = outputs[:, :self._action_dim] 34 | 35 | if return_mean: 36 | return means 37 | 38 | logvars = outputs[:, self._action_dim:] 39 | std = logvars.exp_() 40 | return Independent(Normal(means, std), 1) 41 | 42 | def to(self, device): 43 | return super().to(device) 44 | -------------------------------------------------------------------------------- /rlil/policies/soft_deterministic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn.functional as F 4 | from rlil.approximation import Approximation 5 | from rlil.nn import RLNetwork 6 | from rlil.environments import squash_action 7 | 8 | 9 | class SoftDeterministicPolicy(Approximation): 10 | def __init__( 11 | self, 12 | model, 13 | optimizer, 14 | space, 15 | name="policy", 16 | **kwargs 17 | ): 18 | model = SoftDeterministicPolicyNetwork(model, space) 19 | super().__init__(model, optimizer, name=name, **kwargs) 20 | 21 | def sample_multiple(self, states, num_sample=10): 22 | return self.model.sample_multiple(states, num_sample) 23 | 24 | def compute_log_prob(self, raw, normal): 25 | return self.model.compute_log_prob(raw, normal) 26 | 27 | def mean_logvar(self, states): 28 | return self.model.mean_logvar(states) 29 | 30 | 31 | class SoftDeterministicPolicyNetwork(RLNetwork): 32 | def __init__(self, model, space): 33 | super().__init__(model) 34 | self._action_dim = 
space.shape[0] 35 | self._tanh_scale = torch.tensor( 36 | (space.high - space.low) / 2, 37 | dtype=torch.float32, device=self.device) 38 | self._tanh_mean = torch.tensor( 39 | (space.high + space.low) / 2, 40 | dtype=torch.float32, device=self.device) 41 | 42 | def forward(self, state, return_mean=False): 43 | outputs = super().forward(state) 44 | if return_mean: 45 | means = outputs[:, 0: self._action_dim] 46 | means = squash_action(means, self._tanh_scale, self._tanh_mean) 47 | return means 48 | 49 | # make normal distribution 50 | means = outputs[:, 0: self._action_dim] 51 | logvars = outputs[:, self._action_dim:] 52 | std = logvars.mul(0.5).exp_() 53 | normal = torch.distributions.normal.Normal(means, std) 54 | 55 | # sample from the normal distribution 56 | raw = normal.rsample() 57 | log_prob = self.compute_log_prob(raw, normal) 58 | 59 | action = squash_action(raw, self._tanh_scale, self._tanh_mean) 60 | return action, log_prob 61 | 62 | def sample_multiple(self, state, num_sample=10): 63 | # this function is used in BEAR and BRAC training 64 | outputs = super().forward(state) 65 | 66 | # make normal distribution 67 | means = outputs[:, 0: self._action_dim] 68 | repeated_means = torch.repeat_interleave( 69 | means.unsqueeze(1), num_sample, 1) 70 | logvars = outputs[:, self._action_dim:] 71 | repeated_logvars = torch.repeat_interleave( 72 | logvars.unsqueeze(1), num_sample, 1) 73 | repeated_std = repeated_logvars.mul(0.5).exp_() 74 | # batch x num_sample x d 75 | normal = torch.distributions.normal.Normal( 76 | repeated_means, repeated_std) 77 | raw = normal.rsample() 78 | action = squash_action(raw, self._tanh_scale, self._tanh_mean) 79 | return action, raw 80 | 81 | def compute_log_prob(self, raw, normal): 82 | # see openai spinningup for log_prob computation: 83 | # https://github.com/openai/spinningup/blob/e76f3cc1dfbf94fe052a36082dbd724682f0e8fd/spinup/algos/pytorch/sac/core.py#L53 84 | 85 | log_prob = normal.log_prob(raw).sum(axis=-1) 86 | log_prob -= (2*(np.log(2) - raw - F.softplus(-2*raw))).sum(axis=-1) 87 | return log_prob 88 | 89 | def mean_logvar(self, state): 90 | outputs = super().forward(state) 91 | means = outputs[:, 0: self._action_dim] 92 | logvars = outputs[:, self._action_dim:] 93 | return means, logvars 94 | 95 | def to(self, device): 96 | self._tanh_mean = self._tanh_mean.to(device) 97 | self._tanh_scale = self._tanh_scale.to(device) 98 | return super().to(device) 99 | -------------------------------------------------------------------------------- /rlil/policies/softmax.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional 3 | from rlil.nn import RLNetwork 4 | from rlil.approximation import Approximation 5 | 6 | 7 | class SoftmaxPolicy(Approximation): 8 | def __init__( 9 | self, 10 | model, 11 | optimizer, 12 | name='policy', 13 | **kwargs 14 | ): 15 | model = SoftmaxPolicyNetwork(model) 16 | super().__init__(model, optimizer, name=name, **kwargs) 17 | 18 | 19 | class SoftmaxPolicyNetwork(RLNetwork): 20 | def __init__(self, model): 21 | super().__init__(model) 22 | 23 | def forward(self, state): 24 | outputs = super().forward(state) 25 | probs = functional.softmax(outputs, dim=-1) 26 | return torch.distributions.Categorical(probs) 27 | -------------------------------------------------------------------------------- /rlil/presets/__init__.py: -------------------------------------------------------------------------------- 1 | from rlil.presets.validate_agent import 
env_validation, trainer_validation 2 | import inspect 3 | 4 | __all__ = ["env_validation", "trainer_validation"] 5 | 6 | 7 | def get_default_args(func): 8 | signature = inspect.signature(func) 9 | return { 10 | k: v.default 11 | for k, v in signature.parameters.items() 12 | if v.default is not inspect.Parameter.empty 13 | } -------------------------------------------------------------------------------- /rlil/presets/continuous/__init__.py: -------------------------------------------------------------------------------- 1 | # from .actor_critic import actor_critic 2 | from .vac import vac 3 | from .ddpg import ddpg 4 | from .sac import sac 5 | from .td3 import td3 6 | from .noisy_td3 import noisy_td3 7 | from .ppo import ppo 8 | from .bc import bc 9 | from .vae_bc import vae_bc 10 | from .bcq import bcq 11 | from .bear import bear 12 | from .brac import brac 13 | from .gail import gail 14 | from .sqil import sqil 15 | from .airl import airl 16 | from .rs_mpc import rs_mpc 17 | 18 | __all__ = ['vac', 19 | 'ddpg', 20 | 'sac', 21 | 'td3', 22 | 'noisy_td3', 23 | 'ppo', 24 | 'bcq', 25 | 'bear', 26 | 'brac', 27 | 'bc', 28 | 'vae_bc', 29 | 'gail', 30 | 'sqil', 31 | 'airl', 32 | 'rs_mpc'] 33 | -------------------------------------------------------------------------------- /rlil/presets/continuous/airl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import AIRL 4 | from rlil.initializer import get_device, set_replay_buffer, get_replay_buffer 5 | from .models import fc_reward, fc_v 6 | from rlil.approximation import Approximation, Discriminator, VNetwork 7 | from rlil.memory import ExperienceReplayBuffer, AirlWrapper 8 | 9 | 10 | def airl( 11 | transitions=None, 12 | base_agent_fn=None, 13 | # Common settings 14 | discount_factor=0.98, 15 | # Adam optimizer settings 16 | lr_r=2e-4, 17 | lr_v=2e-4, 18 | # Training settings 19 | minibatch_size=512, 20 | update_frequency=1, 21 | # Replay Buffer settings 22 | replay_start_size=5000, 23 | replay_buffer_size=1e6 24 | ): 25 | """ 26 | Adversarial Inverse Reinforcement Learning (AIRL) control preset 27 | 28 | Args: 29 | transitions: 30 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 31 | base_agent_fn (function): 32 | A function generated by a preset of an agent such as sac, td3, ddpg 33 | Currently, the base_agent_fn must be ppo preset. 34 | lr_r (float): Learning rate for the reward function network. 35 | lr_v (float): Learning rate for the value function network. 36 | update_frequency (int): Number of base_agent update per discriminator update. 37 | minibatch_size (int): Number of experiences to sample in each discriminator update. 38 | replay_start_size (int): Number of experiences in replay buffer when training begins. 39 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 
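    Example (an illustrative sketch; the env id and file path are placeholders,
    and the base agent must currently be the ppo preset):

        import pickle
        from rlil.environments import GymEnvironment
        from rlil.presets.continuous import airl, ppo

        with open("path/to/transitions.pkl", "rb") as f:
            transitions = pickle.load(f)  # dict from cpprb get_all_transitions()

        env = GymEnvironment("Pendulum-v0", append_time=True)
        agent_fn = airl(transitions=transitions, base_agent_fn=ppo())
        agent = agent_fn(env)  # wraps the PPO replay buffer with AirlWrapper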
40 | """ 41 | def _airl(env): 42 | device = get_device() 43 | 44 | base_agent = base_agent_fn(env) 45 | 46 | reward_model = fc_reward(env).to(device) 47 | reward_optimizer = Adam(reward_model.parameters(), 48 | lr=lr_r) 49 | reward_fn = Approximation(reward_model, 50 | reward_optimizer, 51 | name='airl_rew') 52 | 53 | value_model = fc_v(env).to(device) 54 | value_optimizer = Adam(value_model.parameters(), 55 | lr=lr_v) 56 | value_fn = VNetwork(value_model, 57 | value_optimizer, 58 | name='airl_v') 59 | 60 | expert_replay_buffer = ExperienceReplayBuffer(1e7, env) 61 | if transitions is not None: 62 | samples = expert_replay_buffer.samples_from_cpprb( 63 | transitions, device="cpu") 64 | expert_replay_buffer.store(samples) 65 | 66 | replay_buffer = get_replay_buffer() 67 | replay_buffer = AirlWrapper(buffer=replay_buffer, 68 | expert_buffer=expert_replay_buffer, 69 | reward_fn=reward_fn, 70 | value_fn=value_fn, 71 | policy=base_agent.policy, 72 | feature_nw=base_agent.feature_nw, 73 | discount_factor=discount_factor) 74 | set_replay_buffer(replay_buffer) 75 | 76 | # replace base_agent's replay_buffer with gail_buffer 77 | base_agent.replay_buffer = replay_buffer 78 | 79 | return AIRL( 80 | base_agent=base_agent, 81 | minibatch_size=minibatch_size, 82 | replay_start_size=replay_start_size, 83 | update_frequency=update_frequency 84 | ) 85 | return _airl 86 | 87 | 88 | __all__ = ["airl"] 89 | -------------------------------------------------------------------------------- /rlil/presets/continuous/bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import BC 4 | from rlil.initializer import (get_device, 5 | set_replay_buffer, 6 | disable_on_policy_mode) 7 | from rlil.policies import DeterministicPolicy 8 | from rlil.memory import ExperienceReplayBuffer 9 | from .models import fc_deterministic_policy 10 | 11 | 12 | def bc( 13 | transitions=None, 14 | # Adam optimizer settings 15 | lr_pi=1e-3, 16 | # Training settings 17 | minibatch_size=100, 18 | ): 19 | """ 20 | Behavioral Cloning (BC) control preset 21 | 22 | Args: 23 | transitions: 24 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 25 | lr_pi (float): Learning rate for the policy network. 26 | minibatch_size (int): Number of experiences to sample in each training update. 
27 | """ 28 | def _bc(env): 29 | disable_on_policy_mode() 30 | device = get_device() 31 | 32 | policy_model = fc_deterministic_policy(env).to(device) 33 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 34 | policy = DeterministicPolicy( 35 | policy_model, 36 | policy_optimizer, 37 | env.action_space, 38 | ) 39 | 40 | replay_buffer = ExperienceReplayBuffer(1e7, env) 41 | if transitions is not None: 42 | samples = replay_buffer.samples_from_cpprb( 43 | transitions, device="cpu") 44 | replay_buffer.store(samples) 45 | set_replay_buffer(replay_buffer) 46 | 47 | return BC( 48 | policy=policy, 49 | minibatch_size=minibatch_size, 50 | ) 51 | return _bc 52 | 53 | 54 | __all__ = ["bc"] 55 | -------------------------------------------------------------------------------- /rlil/presets/continuous/bear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.optim import Adam 4 | from rlil.agents import BEAR 5 | from rlil.approximation import (EnsembleQContinuous, 6 | PolyakTarget, 7 | BcqEncoder, 8 | BcqDecoder) 9 | from rlil.policies import SoftDeterministicPolicy 10 | from rlil.memory import ExperienceReplayBuffer 11 | from rlil.initializer import (get_device, 12 | set_replay_buffer, 13 | disable_on_policy_mode) 14 | from .models import (fc_q, 15 | fc_soft_policy, 16 | fc_bcq_encoder, 17 | fc_bcq_decoder) 18 | 19 | 20 | def bear( 21 | transitions=None, 22 | # Common settings 23 | discount_factor=0.99, 24 | # Adam optimizer settings 25 | lr_q=1e-3, 26 | lr_pi=1e-3, 27 | lr_enc=1e-3, 28 | lr_dec=1e-3, 29 | # Training settings 30 | minibatch_size=100, 31 | polyak_rate=0.005, 32 | # BEAR settings 33 | num_qs=2, 34 | kernel_type="laplacian", 35 | ): 36 | """ 37 | Bootstrapping error accumulation reduction (BEAR) control preset 38 | 39 | Args: 40 | transitions: 41 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 42 | discount_factor (float): Discount factor for future rewards. 43 | lr_q (float): Learning rate for the Q network. 44 | lr_pi (float): Learning rate for the policy network. 45 | lr_enc (float): Learning rate for the encoder. 46 | lr_dec (float): Learning rate for the decoder. 47 | minibatch_size (int): Number of experiences to sample in each training update. 48 | polyak_rate (float): Speed with which to update the target network towards the online network. 49 | num_qs (int): Number of q functions for ensemble. 
50 | """ 51 | def _bear(env): 52 | disable_on_policy_mode() 53 | 54 | device = get_device() 55 | q_models = nn.ModuleList([fc_q(env) for _ in range(num_qs)]).to(device) 56 | qs_optimizer = Adam(q_models.parameters(), lr=lr_q) 57 | qs = EnsembleQContinuous( 58 | q_models, 59 | qs_optimizer, 60 | target=PolyakTarget(polyak_rate), 61 | name='qs' 62 | ) 63 | 64 | policy_model = fc_soft_policy(env).to(device) 65 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 66 | policy = SoftDeterministicPolicy( 67 | policy_model, 68 | policy_optimizer, 69 | env.action_space, 70 | target=PolyakTarget(polyak_rate), 71 | ) 72 | 73 | latent_dim = env.action_space.shape[0] * 2 74 | encoder_model = fc_bcq_encoder(env, latent_dim=latent_dim).to(device) 75 | encoder_optimizer = Adam(encoder_model.parameters(), lr=lr_enc) 76 | encoder = BcqEncoder( 77 | model=encoder_model, 78 | latent_dim=latent_dim, 79 | optimizer=encoder_optimizer, 80 | name="encoder", 81 | ) 82 | decoder_model = fc_bcq_decoder(env, latent_dim=latent_dim).to(device) 83 | decoder_optimizer = Adam(decoder_model.parameters(), lr=lr_dec) 84 | decoder = BcqDecoder( 85 | model=decoder_model, 86 | latent_dim=latent_dim, 87 | space=env.action_space, 88 | optimizer=decoder_optimizer, 89 | name="decoder", 90 | ) 91 | 92 | replay_buffer = ExperienceReplayBuffer(1e7, env) 93 | if transitions is not None: 94 | samples = replay_buffer.samples_from_cpprb( 95 | transitions, device="cpu") 96 | replay_buffer.store(samples) 97 | set_replay_buffer(replay_buffer) 98 | 99 | return BEAR( 100 | qs=qs, 101 | encoder=encoder, 102 | decoder=decoder, 103 | policy=policy, 104 | kernel_type=kernel_type, 105 | discount_factor=discount_factor, 106 | minibatch_size=minibatch_size, 107 | ) 108 | return _bear 109 | 110 | 111 | __all__ = ["bear"] 112 | -------------------------------------------------------------------------------- /rlil/presets/continuous/brac.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.optim import Adam 4 | from rlil.agents import BRAC 5 | from rlil.approximation import (QContinuous, 6 | PolyakTarget, 7 | BcqEncoder, 8 | BcqDecoder) 9 | from rlil.policies import SoftDeterministicPolicy 10 | from rlil.memory import ExperienceReplayBuffer 11 | from rlil.initializer import (get_device, 12 | set_replay_buffer, 13 | disable_on_policy_mode) 14 | from .models import (fc_q, 15 | fc_soft_policy, 16 | fc_bcq_encoder, 17 | fc_bcq_decoder) 18 | 19 | 20 | def brac( 21 | transitions=None, 22 | # Common settings 23 | discount_factor=0.99, 24 | # Adam optimizer settings 25 | lr_q=1e-3, 26 | lr_pi=1e-3, 27 | # Training settings 28 | bc_iters=5000, 29 | minibatch_size=100, 30 | polyak_rate=0.005, 31 | alpha=0.1 32 | ): 33 | """ 34 | Bootstrapping error accumulation reduction (BEAR) control preset 35 | 36 | Args: 37 | transitions: 38 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 39 | discount_factor (float): Discount factor for future rewards. 40 | lr_q (float): Learning rate for the Q network. 41 | lr_pi (float): Learning rate for the policy network. 42 | alpha (float): Value of lagrange multipliers. Trick 3. 43 | minibatch_size (int): Number of experiences to sample in each training update. 44 | polyak_rate (float): Speed with which to update the target network towards the online network. 
45 | """ 46 | def _brac(env): 47 | disable_on_policy_mode() 48 | 49 | device = get_device() 50 | q_1_model = fc_q(env).to(device) 51 | q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q) 52 | q_1 = QContinuous( 53 | q_1_model, 54 | q_1_optimizer, 55 | target=PolyakTarget(polyak_rate), 56 | name='q_1' 57 | ) 58 | 59 | q_2_model = fc_q(env).to(device) 60 | q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q) 61 | q_2 = QContinuous( 62 | q_2_model, 63 | q_2_optimizer, 64 | target=PolyakTarget(polyak_rate), 65 | name='q_2' 66 | ) 67 | 68 | policy_model = fc_soft_policy(env).to(device) 69 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 70 | policy = SoftDeterministicPolicy( 71 | policy_model, 72 | policy_optimizer, 73 | env.action_space, 74 | target=PolyakTarget(polyak_rate), 75 | ) 76 | 77 | behavior_model = fc_soft_policy(env).to(device) 78 | behavior_optimizer = Adam(behavior_model.parameters(), lr=lr_pi) 79 | behavior_policy = SoftDeterministicPolicy( 80 | behavior_model, 81 | behavior_optimizer, 82 | env.action_space, 83 | target=PolyakTarget(polyak_rate), 84 | name='behavior_policy' 85 | ) 86 | 87 | replay_buffer = ExperienceReplayBuffer(1e7, env) 88 | if transitions is not None: 89 | samples = replay_buffer.samples_from_cpprb( 90 | transitions, device="cpu") 91 | replay_buffer.store(samples) 92 | set_replay_buffer(replay_buffer) 93 | 94 | return BRAC( 95 | q_1=q_1, 96 | q_2=q_2, 97 | policy=policy, 98 | behavior_policy=behavior_policy, 99 | bc_iters=bc_iters, 100 | alpha=alpha, 101 | discount_factor=discount_factor, 102 | minibatch_size=minibatch_size, 103 | ) 104 | return _brac 105 | 106 | 107 | __all__ = ["brac"] 108 | -------------------------------------------------------------------------------- /rlil/presets/continuous/ddpg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import DDPG 4 | from rlil.approximation import QContinuous, PolyakTarget 5 | from rlil.policies import DeterministicPolicy 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.initializer import (get_device, 8 | set_replay_buffer, 9 | disable_on_policy_mode, 10 | set_n_step, 11 | enable_apex) 12 | from .models import fc_q, fc_deterministic_policy 13 | 14 | 15 | def ddpg( 16 | # Common settings 17 | discount_factor=0.99, 18 | # Adam optimizer settings 19 | lr_q=1e-3, 20 | lr_pi=1e-3, 21 | # Training settings 22 | minibatch_size=512, 23 | polyak_rate=0.005, 24 | # Replay Buffer settings 25 | replay_start_size=5000, 26 | replay_buffer_size=1e7, 27 | prioritized=False, 28 | use_apex=False, 29 | n_step=1, 30 | # Exploration settings 31 | noise=0.1, 32 | ): 33 | """ 34 | DDPG continuous control preset. 35 | 36 | Args: 37 | discount_factor (float): Discount factor for future rewards. 38 | lr_q (float): Learning rate for the Q network. 39 | lr_pi (float): Learning rate for the policy network. 40 | minibatch_size (int): Number of experiences to sample in each training update. 41 | polyak_rate (float): Speed with which to update the target network towards the online network. 42 | replay_start_size (int): Number of experiences in replay buffer when training begins. 43 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 44 | prioritized (bool): Use prioritized experience replay if True. 45 | use_apex (bool): Use apex if True. 46 | n_step (int): Number of steps for N step experience replay. 47 | noise (float): The amount of exploration noise to add. 
48 | """ 49 | def _ddpg(env): 50 | disable_on_policy_mode() 51 | 52 | device = get_device() 53 | q_model = fc_q(env).to(device) 54 | q_optimizer = Adam(q_model.parameters(), lr=lr_q) 55 | q = QContinuous( 56 | q_model, 57 | q_optimizer, 58 | target=PolyakTarget(polyak_rate), 59 | ) 60 | 61 | policy_model = fc_deterministic_policy(env).to(device) 62 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 63 | policy = DeterministicPolicy( 64 | policy_model, 65 | policy_optimizer, 66 | env.action_space, 67 | target=PolyakTarget(polyak_rate), 68 | ) 69 | 70 | if use_apex: 71 | enable_apex() 72 | set_n_step(n_step=n_step, discount_factor=discount_factor) 73 | replay_buffer = ExperienceReplayBuffer( 74 | replay_buffer_size, env, 75 | prioritized=prioritized or use_apex) 76 | set_replay_buffer(replay_buffer) 77 | 78 | return DDPG( 79 | q, 80 | policy, 81 | noise=noise, 82 | replay_start_size=replay_start_size, 83 | discount_factor=discount_factor, 84 | minibatch_size=minibatch_size, 85 | ) 86 | return _ddpg 87 | 88 | 89 | __all__ = ["ddpg"] 90 | -------------------------------------------------------------------------------- /rlil/presets/continuous/gail.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import GAIL 4 | from rlil.initializer import get_device, set_replay_buffer, get_replay_buffer 5 | from .models import fc_discriminator 6 | from rlil.approximation import Discriminator 7 | from rlil.memory import ExperienceReplayBuffer, GailWrapper 8 | 9 | 10 | def gail( 11 | transitions=None, 12 | base_agent_fn=None, 13 | # Adam optimizer settings 14 | lr_d=2e-4, 15 | # Training settings 16 | minibatch_size=512, 17 | update_frequency=1, 18 | # Replay Buffer settings 19 | replay_start_size=5000, 20 | replay_buffer_size=1e6 21 | ): 22 | """ 23 | Generative Adversarial Imitation Learning (GAIL) control preset 24 | 25 | Args: 26 | transitions: 27 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 28 | base_agent_fn (function): 29 | A function generated by a preset of an agent such as sac, td3, ddpg 30 | lr_d (float): Learning rate for the discriminator network. 31 | update_frequency (int): Number of base_agent update per discriminator update. 32 | minibatch_size (int): Number of experiences to sample in each discriminator update. 33 | replay_start_size (int): Number of experiences in replay buffer when training begins. 34 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 
35 | """ 36 | def _gail(env): 37 | device = get_device() 38 | 39 | base_agent = base_agent_fn(env) 40 | 41 | discriminator_model = fc_discriminator(env).to(device) 42 | discriminator_optimizer = Adam(discriminator_model.parameters(), 43 | lr=lr_d) 44 | discriminator = Discriminator(discriminator_model, 45 | discriminator_optimizer) 46 | 47 | expert_replay_buffer = ExperienceReplayBuffer(1e7, env) 48 | if transitions is not None: 49 | samples = expert_replay_buffer.samples_from_cpprb( 50 | transitions, device="cpu") 51 | expert_replay_buffer.store(samples) 52 | 53 | replay_buffer = get_replay_buffer() 54 | replay_buffer = GailWrapper(replay_buffer, 55 | expert_replay_buffer, 56 | discriminator) 57 | set_replay_buffer(replay_buffer) 58 | 59 | # replace base_agent's replay_buffer with gail_buffer 60 | base_agent.replay_buffer = replay_buffer 61 | 62 | return GAIL( 63 | base_agent=base_agent, 64 | minibatch_size=minibatch_size, 65 | replay_start_size=replay_start_size, 66 | update_frequency=update_frequency 67 | ) 68 | return _gail 69 | 70 | 71 | __all__ = ["gail"] 72 | -------------------------------------------------------------------------------- /rlil/presets/continuous/noisy_td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import NoisyTD3 4 | from rlil.approximation import QContinuous, PolyakTarget 5 | from rlil.policies import DeterministicPolicy 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.initializer import (get_device, 8 | set_replay_buffer, 9 | disable_on_policy_mode, 10 | set_n_step) 11 | from .models import fc_q, fc_deterministic_noisy_policy 12 | 13 | 14 | def noisy_td3( 15 | # Common settings 16 | discount_factor=0.99, 17 | # Adam optimizer settings 18 | lr_q=1e-3, 19 | lr_pi=1e-3, 20 | # Training settings 21 | minibatch_size=512, 22 | polyak_rate=0.005, 23 | noise_td3=0.2, 24 | policy_update_td3=2, 25 | # Replay Buffer settings 26 | replay_start_size=5000, 27 | replay_buffer_size=1e7, 28 | n_step=1 29 | ): 30 | """ 31 | TD3 continuous control preset. 32 | 33 | Args: 34 | discount_factor (float): Discount factor for future rewards. 35 | lr_q (float): Learning rate for the Q network. 36 | lr_pi (float): Learning rate for the policy network. 37 | minibatch_size (int): Number of experiences to sample in each training update. 38 | polyak_rate (float): Speed with which to update the target network towards the online network. 39 | noise_td3 (float): the amount of noise to add to each action in trick three. 40 | policy_update_td3 (int): Number of timesteps per training update the policy in trick two. 41 | replay_start_size (int): Number of experiences in replay buffer when training begins. 42 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 43 | n_step (int): Number of steps for N step experience replay. 
44 | """ 45 | def _noisy_td3(env): 46 | disable_on_policy_mode() 47 | 48 | device = get_device() 49 | q_1_model = fc_q(env).to(device) 50 | q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q) 51 | q_1 = QContinuous( 52 | q_1_model, 53 | q_1_optimizer, 54 | target=PolyakTarget(polyak_rate), 55 | name='q_1' 56 | ) 57 | 58 | q_2_model = fc_q(env).to(device) 59 | q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q) 60 | q_2 = QContinuous( 61 | q_2_model, 62 | q_2_optimizer, 63 | target=PolyakTarget(polyak_rate), 64 | name='q_2' 65 | ) 66 | 67 | policy_model = fc_deterministic_noisy_policy(env).to(device) 68 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 69 | policy = DeterministicPolicy( 70 | policy_model, 71 | policy_optimizer, 72 | env.action_space, 73 | target=PolyakTarget(polyak_rate), 74 | ) 75 | 76 | set_n_step(n_step=n_step, discount_factor=discount_factor) 77 | replay_buffer = ExperienceReplayBuffer(replay_buffer_size, env) 78 | set_replay_buffer(replay_buffer) 79 | 80 | return NoisyTD3( 81 | q_1, 82 | q_2, 83 | policy, 84 | noise_td3=noise_td3, 85 | policy_update_td3=policy_update_td3, 86 | replay_start_size=replay_start_size, 87 | discount_factor=discount_factor, 88 | minibatch_size=minibatch_size 89 | ) 90 | return _noisy_td3 91 | 92 | 93 | __all__ = ["noisy_td3"] 94 | -------------------------------------------------------------------------------- /rlil/presets/continuous/ppo.py: -------------------------------------------------------------------------------- 1 | from torch.optim import Adam 2 | from rlil.agents import PPO 3 | from rlil.approximation import VNetwork, FeatureNetwork, Approximation 4 | from rlil.policies import GaussianPolicy 5 | from rlil.memory import ExperienceReplayBuffer, GaeWrapper 6 | from rlil.initializer import (get_writer, 7 | get_device, 8 | set_replay_buffer, 9 | enable_on_policy_mode) 10 | from .models import fc_actor_critic 11 | 12 | 13 | def ppo( 14 | # Common settings 15 | discount_factor=0.98, 16 | # Adam optimizer settings 17 | lr=3e-4, # Adam learning rate 18 | eps=1e-5, # Adam stability 19 | # Loss scaling 20 | entropy_loss_scaling=0.0, 21 | value_loss_scaling=0.5, 22 | # Replay Buffer settings 23 | replay_start_size=5000, 24 | # Training settings 25 | clip_grad=0.5, 26 | epsilon=0.2, 27 | minibatches=4, 28 | epochs=2, 29 | # GAE settings 30 | lam=0.95, 31 | ): 32 | """ 33 | PPO continuous control preset. 34 | 35 | Args: 36 | discount_factor (float): Discount factor for future rewards. 37 | lr (float): Learning rate for the Adam optimizer. 38 | eps (float): Stability parameters for the Adam optimizer. 39 | entropy_loss_scaling (float): 40 | Coefficient for the entropy term in the total loss. 41 | value_loss_scaling (float): Coefficient for the value function loss. 42 | replay_start_size (int): Number of experiences in replay buffer when training begins. 43 | clip_grad (float): 44 | The maximum magnitude of the gradient for any given parameter. 45 | Set to 0 to disable. 46 | epsilon (float): 47 | Epsilon value in the clipped PPO objective function. 48 | minibatches (int): The number of minibatches to split each batch into. 49 | lam (float): The Generalized Advantage Estimate (GAE) decay parameter. 
50 | """ 51 | def _ppo(env): 52 | enable_on_policy_mode() 53 | 54 | device = get_device() 55 | feature_model, value_model, policy_model = fc_actor_critic(env) 56 | feature_model.to(device) 57 | value_model.to(device) 58 | policy_model.to(device) 59 | 60 | feature_optimizer = Adam( 61 | feature_model.parameters(), lr=lr, eps=eps 62 | ) 63 | value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps) 64 | policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps) 65 | 66 | feature_nw = FeatureNetwork( 67 | feature_model, 68 | feature_optimizer, 69 | clip_grad=clip_grad, 70 | ) 71 | v = VNetwork( 72 | value_model, 73 | value_optimizer, 74 | loss_scaling=value_loss_scaling, 75 | clip_grad=clip_grad, 76 | ) 77 | policy = GaussianPolicy( 78 | policy_model, 79 | policy_optimizer, 80 | env.action_space, 81 | clip_grad=clip_grad, 82 | ) 83 | 84 | replay_buffer = ExperienceReplayBuffer(1e7, env) 85 | replay_buffer = GaeWrapper(replay_buffer, discount_factor, lam) 86 | set_replay_buffer(replay_buffer) 87 | 88 | return PPO( 89 | feature_nw, 90 | v, 91 | policy, 92 | epsilon=epsilon, 93 | replay_start_size=replay_start_size, 94 | minibatches=minibatches, 95 | entropy_loss_scaling=entropy_loss_scaling, 96 | ) 97 | 98 | return _ppo 99 | 100 | 101 | __all__ = ["ppo"] 102 | -------------------------------------------------------------------------------- /rlil/presets/continuous/rs_mpc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import RsMPC 4 | from rlil.initializer import (get_device, 5 | set_replay_buffer, 6 | disable_on_policy_mode) 7 | from rlil.approximation import Dynamics 8 | from rlil.memory import ExperienceReplayBuffer 9 | from rlil.environments import REWARDS 10 | from .models import fc_dynamics 11 | 12 | 13 | def rs_mpc( 14 | horizon=20, 15 | num_samples=1000, 16 | # Adam optimizer settings 17 | lr_dyn=1e-3, 18 | # Training settings 19 | minibatch_size=100, 20 | # Replay Buffer settings 21 | replay_start_size=5000, 22 | replay_buffer_size=1e7, 23 | ): 24 | """ 25 | Rnadom shooting MPC (RsMPC) control preset 26 | 27 | Args: 28 | horizon (int): Control horizon. 29 | num_samples (int): Number of action samples for random shooting. 30 | lr_dyn (float): Learning rate for the dynamics network. 31 | minibatch_size (int): Number of experiences to sample in each training update. 32 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 33 | """ 34 | def _rs_mpc(env): 35 | assert env.name in REWARDS, \ 36 | "The reward function of {} is not registered in rlil.environments.reward_fns." 
37 | reward_fn = REWARDS[env.name]() 38 | 39 | disable_on_policy_mode() 40 | device = get_device() 41 | 42 | dynamics_model = fc_dynamics(env).to(device) 43 | dynamics_optimizer = Adam(dynamics_model.parameters(), lr=lr_dyn) 44 | dynamics = Dynamics( 45 | dynamics_model, 46 | dynamics_optimizer, 47 | ) 48 | 49 | replay_buffer = ExperienceReplayBuffer(replay_buffer_size, env) 50 | set_replay_buffer(replay_buffer) 51 | 52 | return RsMPC( 53 | dynamics=dynamics, 54 | reward_fn=reward_fn, 55 | horizon=horizon, 56 | num_samples=num_samples, 57 | minibatch_size=minibatch_size, 58 | replay_start_size=replay_start_size 59 | ) 60 | return _rs_mpc 61 | 62 | 63 | __all__ = ["rs_mpc"] 64 | -------------------------------------------------------------------------------- /rlil/presets/continuous/sqil.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from rlil.initializer import set_replay_buffer, get_replay_buffer 3 | from rlil.memory import ExperienceReplayBuffer, SqilWrapper 4 | 5 | 6 | def sqil( 7 | transitions=None, 8 | base_agent_fn=None, 9 | # Replay Buffer settings 10 | replay_start_size=5000, 11 | replay_buffer_size=1e7 12 | ): 13 | """ 14 | Soft Q Imitation Learning (SQIL) control preset 15 | 16 | Args: 17 | transitions: 18 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 19 | base_agent_fn (function): 20 | A function generated by a preset of an agent such as sac, td3, ddpg 21 | replay_start_size (int): Number of experiences in replay buffer when training begins. 22 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 23 | """ 24 | def _sqil(env): 25 | base_agent = base_agent_fn(env) 26 | expert_replay_buffer = ExperienceReplayBuffer(1e7, env) 27 | if transitions is not None: 28 | samples = expert_replay_buffer.samples_from_cpprb( 29 | transitions, device="cpu") 30 | expert_replay_buffer.store(samples) 31 | 32 | replay_buffer = get_replay_buffer() 33 | replay_buffer = SqilWrapper(replay_buffer, 34 | expert_replay_buffer) 35 | set_replay_buffer(replay_buffer) 36 | # replace base_agent's replay_buffer with gail_buffer 37 | base_agent.replay_buffer = replay_buffer 38 | 39 | return base_agent 40 | return _sqil 41 | 42 | 43 | __all__ = ["gail"] 44 | -------------------------------------------------------------------------------- /rlil/presets/continuous/td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import TD3 4 | from rlil.approximation import QContinuous, PolyakTarget 5 | from rlil.policies import DeterministicPolicy 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.initializer import (get_device, 8 | set_replay_buffer, 9 | disable_on_policy_mode, 10 | set_n_step, 11 | enable_apex) 12 | from .models import fc_q, fc_deterministic_policy 13 | 14 | 15 | def td3( 16 | # Common settings 17 | discount_factor=0.99, 18 | # Adam optimizer settings 19 | lr_q=1e-3, 20 | lr_pi=1e-3, 21 | # Training settings 22 | minibatch_size=512, 23 | polyak_rate=0.005, 24 | noise_td3=0.2, 25 | policy_update_td3=2, 26 | # Replay Buffer settings 27 | replay_start_size=5000, 28 | replay_buffer_size=1e7, 29 | prioritized=False, 30 | use_apex=False, 31 | n_step=1, 32 | # Exploration settings 33 | noise_policy=0.1, 34 | ): 35 | """ 36 | TD3 continuous control preset. 37 | 38 | Args: 39 | discount_factor (float): Discount factor for future rewards. 
40 | lr_q (float): Learning rate for the Q network. 41 | lr_pi (float): Learning rate for the policy network. 42 | minibatch_size (int): Number of experiences to sample in each training update. 43 | polyak_rate (float): Speed with which to update the target network towards the online network. 44 | noise_td3 (float): the amount of noise to add to each action in trick three. 45 | policy_update_td3 (int): Number of timesteps per training update the policy in trick two. 46 | replay_start_size (int): Number of experiences in replay buffer when training begins. 47 | replay_buffer_size (int): Maximum number of experiences to store in the replay buffer. 48 | prioritized (bool): Use prioritized experience replay if True. 49 | use_apex (bool): Use apex if True. 50 | n_step (int): Number of steps for N step experience replay. 51 | noise_policy (float): The amount of exploration noise to add. 52 | """ 53 | def _td3(env): 54 | disable_on_policy_mode() 55 | 56 | device = get_device() 57 | q_1_model = fc_q(env).to(device) 58 | q_1_optimizer = Adam(q_1_model.parameters(), lr=lr_q) 59 | q_1 = QContinuous( 60 | q_1_model, 61 | q_1_optimizer, 62 | target=PolyakTarget(polyak_rate), 63 | name='q_1' 64 | ) 65 | 66 | q_2_model = fc_q(env).to(device) 67 | q_2_optimizer = Adam(q_2_model.parameters(), lr=lr_q) 68 | q_2 = QContinuous( 69 | q_2_model, 70 | q_2_optimizer, 71 | target=PolyakTarget(polyak_rate), 72 | name='q_2' 73 | ) 74 | 75 | policy_model = fc_deterministic_policy(env).to(device) 76 | policy_optimizer = Adam(policy_model.parameters(), lr=lr_pi) 77 | policy = DeterministicPolicy( 78 | policy_model, 79 | policy_optimizer, 80 | env.action_space, 81 | target=PolyakTarget(polyak_rate), 82 | ) 83 | 84 | if use_apex: 85 | enable_apex() 86 | set_n_step(n_step=n_step, discount_factor=discount_factor) 87 | replay_buffer = ExperienceReplayBuffer( 88 | replay_buffer_size, env, 89 | prioritized=prioritized or use_apex) 90 | set_replay_buffer(replay_buffer) 91 | 92 | return TD3( 93 | q_1, 94 | q_2, 95 | policy, 96 | noise_policy=noise_policy, 97 | noise_td3=noise_td3, 98 | policy_update_td3=policy_update_td3, 99 | replay_start_size=replay_start_size, 100 | discount_factor=discount_factor, 101 | minibatch_size=minibatch_size 102 | ) 103 | return _td3 104 | 105 | 106 | __all__ = ["td3"] 107 | -------------------------------------------------------------------------------- /rlil/presets/continuous/vac.py: -------------------------------------------------------------------------------- 1 | from torch.optim import Adam 2 | from rlil.agents import VAC 3 | from rlil.approximation import VNetwork, FeatureNetwork, Approximation 4 | from rlil.policies import GaussianPolicy 5 | from rlil.memory import ExperienceReplayBuffer 6 | from rlil.initializer import (get_writer, 7 | get_device, 8 | set_replay_buffer, 9 | enable_on_policy_mode) 10 | from .models import fc_actor_critic 11 | 12 | 13 | def vac( 14 | # Common settings 15 | discount_factor=0.98, 16 | # Adam optimizer settings 17 | lr=3e-4, # Adam learning rate 18 | eps=1e-5, # Adam stability 19 | # Loss scaling 20 | value_loss_scaling=0.5, 21 | # Replay Buffer settings 22 | replay_start_size=500, 23 | # Training settings 24 | clip_grad=0.5, 25 | ): 26 | """ 27 | VAC continuous control preset. 28 | 29 | Args: 30 | discount_factor (float): Discount factor for future rewards. 31 | lr (float): Learning rate for the Adam optimizer. 32 | eps (float): Stability parameters for the Adam optimizer. 
33 | entropy_loss_scaling (float): 34 | Coefficient for the entropy term in the total loss. 35 | value_loss_scaling (float): Coefficient for the value function loss. 36 | replay_start_size (int): Number of experiences in replay buffer when training begins. 37 | clip_grad (float): 38 | The maximum magnitude of the gradient for any given parameter. 39 | Set to 0 to disable. 40 | """ 41 | def _vac(env): 42 | enable_on_policy_mode() 43 | 44 | device = get_device() 45 | feature_model, value_model, policy_model = fc_actor_critic(env) 46 | feature_model.to(device) 47 | value_model.to(device) 48 | policy_model.to(device) 49 | 50 | feature_optimizer = Adam( 51 | feature_model.parameters(), lr=lr, eps=eps 52 | ) 53 | value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps) 54 | policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps) 55 | 56 | feature_nw = FeatureNetwork( 57 | feature_model, 58 | feature_optimizer, 59 | clip_grad=clip_grad, 60 | ) 61 | v = VNetwork( 62 | value_model, 63 | value_optimizer, 64 | loss_scaling=value_loss_scaling, 65 | clip_grad=clip_grad, 66 | ) 67 | policy = GaussianPolicy( 68 | policy_model, 69 | policy_optimizer, 70 | env.action_space, 71 | clip_grad=clip_grad, 72 | ) 73 | 74 | replay_buffer = ExperienceReplayBuffer(1e7, env) 75 | set_replay_buffer(replay_buffer) 76 | 77 | return VAC( 78 | feature_nw, 79 | v, 80 | policy, 81 | discount_factor=discount_factor, 82 | replay_start_size=replay_start_size, 83 | ) 84 | 85 | return _vac 86 | 87 | 88 | __all__ = ["vac"] 89 | -------------------------------------------------------------------------------- /rlil/presets/continuous/vae_bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.optim import Adam 3 | from rlil.agents import VaeBC 4 | from rlil.approximation import (BcqEncoder, 5 | BcqDecoder) 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.initializer import (get_device, 8 | set_replay_buffer, 9 | disable_on_policy_mode) 10 | from .models import (fc_bcq_encoder, 11 | fc_bcq_decoder) 12 | 13 | 14 | def vae_bc( 15 | transitions=None, 16 | # Adam optimizer settings 17 | lr_enc=1e-3, 18 | lr_dec=1e-3, 19 | # Training settings 20 | minibatch_size=100, 21 | ): 22 | """ 23 | VAE Behavioral Cloning (VAE-BC) control preset 24 | 25 | Args: 26 | transitions: 27 | dictionary of transitions generated by cpprb.ReplayBuffer.get_all_transitions() 28 | lr_enc (float): Learning rate for the encoder. 29 | lr_dec (float): Learning rate for the decoder. 30 | minibatch_size (int): Number of experiences to sample in each training update. 
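    Example (an illustrative sketch; transitions is assumed to be a cpprb
    transition dict loaded as in scripts/continuous/offline.py, and the env id
    is a placeholder):

        from rlil.environments import GymEnvironment
        from rlil.presets.continuous import vae_bc

        env = GymEnvironment("Pendulum-v0", append_time=True)
        agent = vae_bc(transitions=transitions, lr_enc=1e-3, lr_dec=1e-3)(env)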
31 | """ 32 | def _vae_bc(env): 33 | disable_on_policy_mode() 34 | device = get_device() 35 | 36 | latent_dim = env.action_space.shape[0] * 2 37 | 38 | encoder_model = fc_bcq_encoder(env, latent_dim=latent_dim).to(device) 39 | encoder_optimizer = Adam(encoder_model.parameters(), lr=lr_enc) 40 | encoder = BcqEncoder( 41 | model=encoder_model, 42 | latent_dim=latent_dim, 43 | optimizer=encoder_optimizer, 44 | name="encoder", 45 | ) 46 | decoder_model = fc_bcq_decoder(env, latent_dim=latent_dim).to(device) 47 | decoder_optimizer = Adam(decoder_model.parameters(), lr=lr_dec) 48 | decoder = BcqDecoder( 49 | model=decoder_model, 50 | latent_dim=latent_dim, 51 | space=env.action_space, 52 | optimizer=decoder_optimizer, 53 | name="decoder", 54 | ) 55 | 56 | replay_buffer = ExperienceReplayBuffer(1e7, env) 57 | if transitions is not None: 58 | samples = replay_buffer.samples_from_cpprb( 59 | transitions, device="cpu") 60 | replay_buffer.store(samples) 61 | set_replay_buffer(replay_buffer) 62 | 63 | return VaeBC( 64 | encoder=encoder, 65 | decoder=decoder, 66 | minibatch_size=minibatch_size, 67 | ) 68 | return _vae_bc 69 | 70 | 71 | __all__ = ["vae_bc"] 72 | -------------------------------------------------------------------------------- /rlil/presets/validate_agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import ray 4 | from rlil.environments import State 5 | from rlil.initializer import is_on_policy_mode 6 | from rlil.samplers import AsyncSampler 7 | from rlil.experiments import Trainer 8 | 9 | 10 | def env_validation(agent_fn, env, done_step=-1): 11 | """ 12 | Args: 13 | agent_fn (func): presets of the agent 14 | env (rlil.GymEnvironment) 15 | done_step (optional): 16 | Run until the step reaches done_step. 17 | If less than 0, run until env.done == True. 
18 | """ 19 | 20 | agent = agent_fn(env) 21 | num_trains = 0 22 | 23 | for _ in range(2): 24 | env.reset() 25 | done_flag = False 26 | step = 0 27 | while not done_flag: 28 | num_trains += agent.should_train() 29 | if not is_on_policy_mode(): 30 | agent.train() 31 | env.step(agent.act(env.state, env.reward)) 32 | step += 1 33 | if done_step < 0: 34 | done_flag = env.done 35 | else: 36 | done_flag = done_step < step 37 | num_trains += agent.should_train() 38 | agent.train() 39 | agent.act(env.state, env.reward) 40 | 41 | assert num_trains > 0 42 | 43 | 44 | def trainer_validation(agent_fn, env, apex=False): 45 | agent = agent_fn(env) 46 | lazy_agent = agent.make_lazy_agent() 47 | eval_lazy_agent = agent.make_lazy_agent(evaluation=True) 48 | lazy_agent.set_replay_buffer(env) 49 | eval_lazy_agent.set_replay_buffer(env) 50 | 51 | env.reset() 52 | action = lazy_agent.act(env.state, env.reward) 53 | 54 | while not env.done: 55 | env.step(action) 56 | action = lazy_agent.act(env.state, env.reward) 57 | _ = eval_lazy_agent.act(env.state, env.reward) 58 | 59 | lazy_agent.replay_buffer.on_episode_end() 60 | 61 | samples = lazy_agent.replay_buffer.get_all_transitions() 62 | samples.weights = lazy_agent.compute_priorities(samples) 63 | if apex: 64 | assert samples.weights is not None 65 | agent.replay_buffer.store(samples) 66 | agent.train() 67 | agent.train() 68 | assert agent.writer.train_steps > 1 69 | -------------------------------------------------------------------------------- /rlil/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from rlil.samplers.base import Sampler 2 | from rlil.samplers.asyncsampler import AsyncSampler, StartInfo 3 | 4 | __all__ = ["Sampler", "AsyncSampler", "StartInfo"] 5 | -------------------------------------------------------------------------------- /rlil/samplers/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class Sampler(ABC): 5 | """ 6 | Abstract sampler class 7 | """ 8 | 9 | @abstractmethod 10 | def start_sampling(self, agent, worker_frames, worker_episodes): 11 | """ 12 | Start sampling until it reaches worker_frames or worker_episodes. 13 | 14 | Args: 15 | agent (rlil.agent): Agent to collect samples 16 | worker_frames (int): worker stops to sample when it collects worker_frames 17 | worker_episodes (int): worker stops to sample when it reaches worker_episodes 18 | """ 19 | 20 | @abstractmethod 21 | def store_samples(self): 22 | """ 23 | Store collected samples to the replay_buffer 24 | 25 | Returns: 26 | result (dict): Information of sampling (e.g. 
stored frames, returns, etc) 27 | """ 28 | -------------------------------------------------------------------------------- /rlil/utils/__init__.py: -------------------------------------------------------------------------------- 1 | class Samples: 2 | def __init__(self, states=None, actions=None, rewards=None, 3 | next_states=None, weights=None, indexes=None): 4 | self.states = states 5 | self.actions = actions 6 | self.rewards = rewards 7 | self.next_states = next_states 8 | self.weights = weights 9 | self.indexes = indexes 10 | self._keys = [self.states, self.actions, self.rewards, 11 | self.next_states, self.weights, self.indexes] 12 | 13 | def __iter__(self): 14 | return iter(self._keys) 15 | 16 | 17 | def samples_to_np(samples): 18 | np_states, np_dones = samples.states.raw_numpy() 19 | np_actions = samples.actions.raw_numpy() 20 | np_rewards = samples.rewards.detach().cpu().numpy() 21 | np_next_states, np_next_dones = samples.next_states.raw_numpy() 22 | return np_states, np_rewards, np_actions, np_next_states, \ 23 | np_dones, np_next_dones 24 | -------------------------------------------------------------------------------- /runs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/continuous/offline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet 3 | import pybullet_envs 4 | from rlil.environments import GymEnvironment, ENVS 5 | from rlil.experiments import Experiment 6 | from rlil.presets import get_default_args 7 | from rlil.presets import continuous 8 | from rlil.initializer import get_logger, set_device, set_seed, get_writer 9 | import torch 10 | import logging 11 | import ray 12 | import pickle 13 | import os 14 | import shutil 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser( 19 | description="Run an offline continuous actions benchmark.") 20 | parser.add_argument("env", help="Name of the env") 21 | parser.add_argument("agent", 22 | help="Name of the agent (e.g. bc). See presets for available agents.") 23 | parser.add_argument("dir", 24 | help="Directory where the transitions.pkl is saved.") 25 | parser.add_argument("--device", default="cuda", 26 | help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)") 27 | parser.add_argument("--seed", type=int, default=0, 28 | help="Random seed") 29 | parser.add_argument("--train_minutes", type=int, default=30, 30 | help="Minutes to train.") 31 | parser.add_argument("--num_workers_eval", type=int, 32 | default=1, help="Number of workers for evaluation") 33 | parser.add_argument("--exp_info", default="default experiment", 34 | help="One line descriptions of the experiment. 
\ 35 | Experiments' results are saved in 'runs/[exp_info]/[env_id]/'") 36 | 37 | args = parser.parse_args() 38 | 39 | # initialization 40 | ray.init(include_webui=False, ignore_reinit_error=True) 41 | set_device(torch.device(args.device)) 42 | set_seed(args.seed) 43 | logger = get_logger() 44 | logger.setLevel(logging.DEBUG) 45 | 46 | # set environment 47 | if args.env in ENVS: 48 | env_id = ENVS[args.env] 49 | else: 50 | env_id = args.env 51 | env = GymEnvironment(env_id, append_time=True) 52 | 53 | # set agent 54 | agent_name = args.agent 55 | preset = getattr(continuous, agent_name) 56 | with open(os.path.join(args.dir, "transitions.pkl"), mode='rb') as f: 57 | transitions = pickle.load(f) 58 | agent_fn = preset(transitions) 59 | 60 | # set args_dict 61 | args_dict = get_default_args(preset) 62 | args_dict.update(vars(args)) 63 | 64 | Experiment( 65 | agent_fn, env, 66 | num_workers=0, 67 | num_workers_eval=args.num_workers_eval, 68 | train_minutes=args.train_minutes, 69 | args_dict=args_dict, 70 | seed=args.seed, 71 | exp_info=args.exp_info, 72 | ) 73 | 74 | # copy demo_return.json if exists 75 | demo_return_path = os.path.join(args.dir, "demo_return.json") 76 | if os.path.exists(demo_return_path): 77 | writer = get_writer() 78 | shutil.copy2(demo_return_path, writer.log_dir) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /scripts/continuous/online.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet 3 | import pybullet_envs 4 | from rlil.environments import GymEnvironment, ENVS 5 | from rlil.experiments import Experiment 6 | from rlil.presets import get_default_args, continuous 7 | from rlil.initializer import get_logger, set_device, set_seed 8 | import torch 9 | import logging 10 | import ray 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser( 15 | description="Run a continuous actions benchmark.") 16 | parser.add_argument("env", help="Name of the env") 17 | parser.add_argument("agent", 18 | help="Name of the agent (e.g. ppo). See presets for available agents.") 19 | parser.add_argument("--device", default="cuda", 20 | help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)") 21 | parser.add_argument("--seed", type=int, default=0, 22 | help="Random seed") 23 | parser.add_argument("--train_minutes", type=int, default=60, 24 | help="Minutes to train.") 25 | parser.add_argument("--num_workers", type=int, default=1, 26 | help="Number of workers for training") 27 | parser.add_argument("--exp_info", default="default experiment", 28 | help="One line descriptions of the experiment. 
\ 29 | Experiments' results are saved in 'runs/[exp_info]/[env_id]/'") 30 | 31 | args = parser.parse_args() 32 | 33 | # initialization 34 | ray.init(include_webui=False, ignore_reinit_error=True) 35 | set_device(torch.device(args.device)) 36 | set_seed(args.seed) 37 | logger = get_logger() 38 | logger.setLevel(logging.DEBUG) 39 | 40 | # set environment 41 | if args.env in ENVS: 42 | env_id = ENVS[args.env] 43 | else: 44 | env_id = args.env 45 | env = GymEnvironment(env_id, append_time=True) 46 | 47 | # set agent 48 | agent_name = args.agent 49 | preset = getattr(continuous, agent_name) 50 | buffer_args = {"n_step": 1, "prioritized": False, "use_apex": False} 51 | agent_fn = preset(**buffer_args) 52 | 53 | # set args_dict 54 | args_dict = get_default_args(preset) 55 | args_dict.update(vars(args)) 56 | args_dict.update(buffer_args) 57 | 58 | Experiment( 59 | agent_fn, env, 60 | num_workers=args.num_workers, 61 | train_minutes=args.train_minutes, 62 | args_dict=args_dict, 63 | seed=args.seed, 64 | exp_info=args.exp_info, 65 | ) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /scripts/continuous/online_il.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet 3 | import pybullet_envs 4 | from rlil.environments import GymEnvironment, ENVS 5 | from rlil.experiments import Experiment 6 | from rlil.presets import get_default_args 7 | from rlil.presets import continuous 8 | from rlil.initializer import get_logger, set_device, set_seed, get_writer 9 | import torch 10 | import logging 11 | import ray 12 | import pickle 13 | import os 14 | import shutil 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser( 19 | description="Run an online_il benchmark.") 20 | parser.add_argument("env", help="Name of the env") 21 | parser.add_argument("agent", 22 | help="Name of the online imitation learning agent \ 23 | (e.g. gail). See presets for available agents.") 24 | parser.add_argument("base_agent", 25 | help="Name of the base agent (e.g. ddpg). \ 26 | See presets for available agents.") 27 | parser.add_argument("dir", 28 | help="Directory where the transitions.pkl is saved.") 29 | parser.add_argument("--device", default="cuda", 30 | help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)") 31 | parser.add_argument("--seed", type=int, default=0, 32 | help="Random seed") 33 | parser.add_argument("--train_minutes", type=int, default=60, 34 | help="Minutes to train.") 35 | parser.add_argument("--trains_per_episode", type=int, default=5, 36 | help="Number of training steps per episode") 37 | parser.add_argument("--num_workers", type=int, 38 | default=1, help="Number of workers for training") 39 | parser.add_argument("--exp_info", default="default experiment", 40 | help="One line descriptions of the experiment. 
\ 41 | Experiments' results are saved in 'runs/[exp_info]/[env_id]/'") 42 | 43 | args = parser.parse_args() 44 | 45 | # initialization 46 | ray.init(include_webui=False, ignore_reinit_error=True) 47 | set_device(torch.device(args.device)) 48 | set_seed(args.seed) 49 | logger = get_logger() 50 | logger.setLevel(logging.DEBUG) 51 | 52 | # set environment 53 | if args.env in ENVS: 54 | env_id = ENVS[args.env] 55 | else: 56 | env_id = args.env 57 | env = GymEnvironment(env_id, append_time=True) 58 | 59 | # set base_agent 60 | base_preset = getattr(continuous, args.base_agent) 61 | base_agent_fn = base_preset() 62 | 63 | # set agent 64 | with open(os.path.join(args.dir, "transitions.pkl"), mode='rb') as f: 65 | transitions = pickle.load(f) 66 | preset = getattr(continuous, args.agent) 67 | agent_fn = preset( 68 | transitions=transitions, 69 | base_agent_fn=base_agent_fn, 70 | ) 71 | 72 | agent_name = agent_fn.__name__[1:] 73 | base_agent_name = base_agent_fn.__name__[1:] 74 | 75 | # set args_dict 76 | args_dict = {"args": {}, base_agent_name: {}, agent_name: {}} 77 | args_dict["args"] = vars(args) 78 | args_dict[base_agent_name] = get_default_args(base_preset) 79 | args_dict[agent_name] = get_default_args(preset) 80 | 81 | Experiment( 82 | agent_fn, env, 83 | agent_name=agent_name + "-" + base_agent_name, 84 | num_workers=args.num_workers, 85 | train_minutes=args.train_minutes, 86 | trains_per_episode=args.trains_per_episode, 87 | args_dict=args_dict, 88 | seed=args.seed, 89 | exp_info=args.exp_info, 90 | ) 91 | 92 | # copy demo_return.json if exists 93 | demo_return_path = os.path.join(args.dir, "demo_return.json") 94 | if os.path.exists(demo_return_path): 95 | writer = get_writer() 96 | shutil.copy2(demo_return_path, writer.log_dir) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /scripts/continuous/watch_continuous.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet 3 | import pybullet_envs 4 | import re 5 | import os 6 | import time 7 | from rlil.environments import GymEnvironment, ENVS 8 | from rlil.presets import continuous 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser(description="Watch a continuous agent.") 13 | parser.add_argument("env", help="Name of the env") 14 | parser.add_argument("agent", 15 | help="Name of the agent (e.g. ppo). See presets for available agents.") 16 | parser.add_argument("--train", action="store_true", 17 | help="The model of lazy_agent: evaluation or training.") 18 | parser.add_argument( 19 | "dir", help="Directory where the agent's model was saved.") 20 | parser.add_argument( 21 | "--device", 22 | default="cpu", 23 | help="The name of the device to run the agent on (e.g. 
cpu, cuda, cuda:0)", 24 | ) 25 | parser.add_argument( 26 | "--fps", 27 | default=120, 28 | help="Playback speed", 29 | ) 30 | args = parser.parse_args() 31 | 32 | # load env 33 | env = GymEnvironment(ENVS[args.env], append_time=True) 34 | 35 | # load agent 36 | agent_fn = getattr(continuous, args.agent)() 37 | agent = agent_fn(env) 38 | agent.load(args.dir) 39 | 40 | # watch 41 | watch(agent, env, fps=args.fps, eval=not args.train) 42 | 43 | 44 | def watch(agent, env, fps=60, eval=True): 45 | action = None 46 | returns = 0 47 | # have to call this before initial reset for pybullet envs 48 | if "Bullet" in env.name: 49 | env.render(mode="human") 50 | while True: 51 | time.sleep(1 / fps) 52 | if env.done: 53 | lazy_agent = agent.make_lazy_agent(evaluation=eval) 54 | lazy_agent.set_replay_buffer(env) 55 | print('returns: {}'.format(returns)) 56 | env.reset() 57 | returns = 0 58 | else: 59 | env.step(action) 60 | env.render() 61 | action = lazy_agent.act(env.state, env.reward) 62 | returns += env.reward 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /scripts/offline_continuous.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export exp_info=offline 4 | export train_minutes=120 5 | 6 | for agent in bc bcq 7 | do 8 | for seed in {0..3} 9 | do 10 | # ant 11 | tsp python ~/pytorch-rl-il/scripts/continuous/offline.py ant $agent runs/demos/AntBulletEnv-v0/td3_3000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 12 | # hopper 13 | tsp python ~/pytorch-rl-il/scripts/continuous/offline.py hopper $agent runs/demos/HopperBulletEnv-v0/sac_2000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 14 | # humanoid 15 | tsp python ~/pytorch-rl-il/scripts/continuous/offline.py humanoid $agent runs/demos/HumanoidBulletEnv-v0/td3_1700 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 16 | # walker 17 | tsp python ~/pytorch-rl-il/scripts/continuous/offline.py walker $agent runs/demos/WalkerBulletEnv-v0/ppo_3000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 18 | done 19 | done -------------------------------------------------------------------------------- /scripts/online_continuous.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export exp_info=online 4 | export train_minutes=120 5 | export num_workers=8 6 | 7 | for env in ant humanoid walker lander 8 | do 9 | for agent in ppo ddpg td3 sac 10 | do 11 | tsp python ~/pytorch-rl-il/scripts/continuous/online.py $env $agent --train_minutes $train_minutes --num_workers $num_workers --exp_info $exp_info --device cuda:0 --seed 0 12 | tsp python ~/pytorch-rl-il/scripts/continuous/online.py $env $agent --train_minutes $train_minutes --num_workers $num_workers --exp_info $exp_info --device cuda:0 --seed 1 13 | tsp python ~/pytorch-rl-il/scripts/continuous/online.py $env $agent --train_minutes $train_minutes --num_workers $num_workers --exp_info $exp_info --device cuda:1 --seed 2 14 | tsp python ~/pytorch-rl-il/scripts/continuous/online.py $env $agent --train_minutes $train_minutes --num_workers $num_workers --exp_info $exp_info --device cuda:1 --seed 3 15 | done 16 | done 17 | -------------------------------------------------------------------------------- /scripts/online_il_continuous.bash: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export exp_info=online_il 4 | export train_minutes=120 5 | export trains_per_episode=5 6 | export num_workers=8 7 | 8 | for agent in gail sqil 9 | do 10 | for base_agent in ppo sac 11 | do 12 | for seed in {0..3} 13 | do 14 | # ant 15 | tsp python ~/pytorch-rl-il/scripts/continuous/online_il.py ant $agent $base_agent runs/demos/AntBulletEnv-v0/td3_3000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 16 | # hopper 17 | tsp python ~/pytorch-rl-il/scripts/continuous/online_il.py hopper $agent $base_agent runs/demos/HopperBulletEnv-v0/sac_2000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:0 --seed $seed 18 | # humanoid 19 | tsp python ~/pytorch-rl-il/scripts/continuous/online_il.py humanoid $agent $base_agent runs/demos/HumanoidBulletEnv-v0/td3_1700 --train_minutes $train_minutes --exp_info $exp_info --device cuda:1 --seed $seed 20 | # walker 21 | tsp python ~/pytorch-rl-il/scripts/continuous/online_il.py walker $agent $base_agent runs/demos/Walker2DBulletEnv-v0/ppo_3000 --train_minutes $train_minutes --exp_info $exp_info --device cuda:1 --seed $seed 22 | done 23 | done 24 | done -------------------------------------------------------------------------------- /scripts/plot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from rlil.utils.plots import plot 3 | 4 | 5 | if __name__ == "__main__": 6 | parser = argparse.ArgumentParser( 7 | description="Plots the results of experiments.") 8 | parser.add_argument("dir", 9 | help="Experiment directory. This is a directory of exp_info, not runs/") 10 | parser.add_argument("--step", type=str, default="train_steps", 11 | help="The unit of x-axis. You can choose it from \ 12 | [sample_frames, sample_episodes, train_steps, minutes]") 13 | 14 | args = parser.parse_args() 15 | 16 | plot(args.dir, args.step) 17 | -------------------------------------------------------------------------------- /scripts/record_trajectory.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pybullet 3 | import pybullet_envs 4 | import os 5 | import time 6 | import pickle 7 | import json 8 | import numpy as np 9 | import ray 10 | from rlil.memory import ExperienceReplayBuffer 11 | from rlil.initializer import set_replay_buffer, get_replay_buffer 12 | from rlil.samplers import AsyncSampler 13 | from rlil.environments import GymEnvironment 14 | from rlil.presets import continuous 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser(description="Record a trajectory of trained agent. \ 19 | The trajectory will be stored as transitions.pkl in the args.dir.") 20 | parser.add_argument( 21 | "dir", help="Directory where the agent's model is saved.") 22 | parser.add_argument( 23 | "--device", 24 | default="cpu", 25 | help="The name of the device to run the agent on (e.g. 
cpu, cuda, cuda:0)", 26 | ) 27 | parser.add_argument("--train", action="store_true", 28 | help="The model of lazy_agent: evaluation or training.") 29 | parser.add_argument("--num_workers", type=int, default=1, 30 | help="Number of workers for training") 31 | parser.add_argument("--frames", type=int, default=1e6, 32 | help="Number of frames to store") 33 | 34 | args = parser.parse_args() 35 | ray.init(include_webui=False, ignore_reinit_error=True) 36 | 37 | # load env 38 | if args.dir[-1] != "/": 39 | args.dir += "/" 40 | env_id = args.dir.split("/")[-3] 41 | env = GymEnvironment(env_id, append_time=True) 42 | 43 | # load agent 44 | agent_name = os.path.basename( 45 | os.path.dirname(args.dir)).split("_")[0] 46 | agent_fn = getattr(continuous, agent_name)() 47 | agent = agent_fn(env) 48 | agent.load(args.dir) 49 | lazy_agent = agent.make_lazy_agent( 50 | evaluation=not args.train, store_samples=True) 51 | 52 | # reset ExperienceReplayBuffer 53 | set_replay_buffer(ExperienceReplayBuffer(args.frames + 10, env)) 54 | 55 | # set sampler 56 | sampler = AsyncSampler(env, num_workers=args.num_workers) 57 | 58 | # start recording 59 | replay_buffer = get_replay_buffer() 60 | 61 | returns = [] 62 | while len(replay_buffer) < args.frames: 63 | sampler.start_sampling( 64 | lazy_agent, worker_episodes=1) 65 | 66 | sample_result = sampler.store_samples(timeout=1) 67 | for sample_info in sample_result.values(): 68 | returns += sample_info["returns"] 69 | 70 | # save return info of the policy 71 | returns_dict = {"mean": np.mean(returns), "std": np.std(returns)} 72 | filepath = os.path.join(args.dir, 'demo_return.json') 73 | with open(filepath, mode='w') as f: 74 | json.dump(returns_dict, f) 75 | 76 | # save replay buffer 77 | filepath = os.path.join(args.dir, 'transitions.pkl') 78 | with open(filepath, mode='wb') as f: 79 | samples = replay_buffer.get_all_transitions(return_cpprb=True) 80 | pickle.dump(samples, f) 81 | 82 | print("Transitions (size: {}) is saved at {}".format( 83 | len(replay_buffer), filepath)) 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="pytorch-rl-il", 5 | version="0.0.1", 6 | description=( 7 | "A library for building reinforcement learning and imitation learning agents in Pytorch"), 8 | packages=find_packages(), 9 | url="https://github.com/syuntoku14/pytorch-rl-il", 10 | author="Toshinori Kitamura", 11 | author_email="syuntoku14@gmail.com", 12 | install_requires=[ 13 | "gym[atari,box2d]", # atari environments 14 | "numpy", # math library 15 | "matplotlib", # plotting library 16 | "seaborn", # plotting library 17 | "pandas", 18 | "opencv-python", # used by atari wrappers 19 | "pybullet", # continuous environments 20 | "autopep8", # code quality tool 21 | "torch-testing", # testing library for pytorch 22 | "ray", # multiprocessing tool 23 | "pytest", # python testing library 24 | "cpprb", # fast replay buffer library 25 | "pytest-benchmark", 26 | "gitpython" 27 | # these should be installed globally: 28 | # "tensorflow", # needed for tensorboard 29 | # "torch", # deep learning library 30 | # "torchvision", # install alongside pytorch 31 | ], 32 | extras_require={ 33 | "pytorch": [ 34 | "torch", 35 | "torchvision", 36 | "tensorboard" 37 | ], 38 | "docs": [ 39 | "sphinx", 40 | "sphinx-autobuild", 41 | "sphinx-rtd-theme", 42 | 
"sphinx-automodapi" 43 | ] 44 | }, 45 | ) 46 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/__init__.py -------------------------------------------------------------------------------- /tests/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/agents/__init__.py -------------------------------------------------------------------------------- /tests/approximation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/approximation/__init__.py -------------------------------------------------------------------------------- /tests/approximation/bcq_encoder_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch_testing as tt 4 | from torch.nn.functional import mse_loss 5 | from rlil import nn 6 | from rlil.approximation.bcq_auto_encoder import BcqEncoder, BcqDecoder 7 | from rlil.environments import State, Action, GymEnvironment 8 | from rlil.presets.continuous.models import fc_bcq_encoder, fc_bcq_decoder 9 | import numpy as np 10 | 11 | 12 | # Test the network architecture of 13 | # https://github.com/sfujim/BCQ/blob/05c07fc442a2be96f6249b966682cf065045500f/BCQ.py 14 | @pytest.fixture 15 | def setUp(): 16 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 17 | Action.set_action_space(env.action_space) 18 | latent_dim = 32 19 | num_samples = 5 20 | encoder_model = fc_bcq_encoder(env, latent_dim=latent_dim) 21 | decoder_model = fc_bcq_decoder(env, latent_dim=latent_dim) 22 | 23 | encoder_optimizer = torch.optim.SGD(encoder_model.parameters(), lr=0.1) 24 | decoder_optimizer = torch.optim.SGD(decoder_model.parameters(), lr=0.1) 25 | encoder = BcqEncoder(model=encoder_model, 26 | latent_dim=latent_dim, 27 | optimizer=encoder_optimizer) 28 | decoder = BcqDecoder(model=decoder_model, 29 | latent_dim=latent_dim, 30 | space=env.action_space, 31 | optimizer=decoder_optimizer) 32 | sample_states = State.from_list([env.reset() for _ in range(num_samples)]) 33 | sample_actions = Action( 34 | torch.tensor([env.action_space.sample() for _ in range(num_samples)])) 35 | 36 | yield encoder, decoder, sample_states, sample_actions 37 | 38 | 39 | def test_decode(setUp): 40 | encoder, decoder, states, actions = setUp 41 | mean, log_var = encoder(states, actions) 42 | z = mean + (0.5 * log_var).exp() * torch.randn_like(log_var) 43 | dec = decoder(states, z) 44 | assert actions.shape == dec.shape 45 | 46 | 47 | def test_decode_multiple(setUp): 48 | encoder, decoder, states, actions = setUp 49 | dec = decoder.decode_multiple(states, 10) 50 | assert (actions.shape[0], 10, actions.shape[-1]) == dec[0].shape 51 | 52 | 53 | def test_reinforce(setUp): 54 | encoder, decoder, states, actions = setUp 55 | mean, log_var = encoder(states, actions) 56 | # reinforce mse 57 | z = mean + (0.5 * log_var).exp() * torch.randn_like(log_var) 58 | dec = decoder(states, z) 59 | loss = mse_loss(actions.features, dec) 60 | 61 | for _ in range(100): 62 | mean, log_var = encoder(states, actions) 63 | z = 
mean + log_var.exp() * torch.randn_like(log_var) 64 | dec = decoder(states, z) 65 | new_loss = mse_loss(actions.features, dec) 66 | decoder.reinforce(new_loss) 67 | encoder.reinforce() 68 | assert new_loss < loss 69 | 70 | # reinforce mse 71 | z = mean + (0.5 * log_var).exp() * torch.randn_like(log_var) 72 | dec = decoder(states, z) 73 | loss = nn.kl_loss_vae(mean, log_var) 74 | 75 | for _ in range(10): 76 | mean, log_var = encoder(states, actions) 77 | z = mean + log_var.exp() * torch.randn_like(log_var) 78 | dec = decoder(states, z) 79 | new_loss = nn.kl_loss_vae(mean, log_var) 80 | decoder.reinforce(new_loss) 81 | encoder.reinforce() 82 | assert new_loss < loss 83 | -------------------------------------------------------------------------------- /tests/approximation/dynamics_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch_testing as tt 4 | from torch.nn.functional import mse_loss 5 | from rlil import nn 6 | from rlil.approximation import Dynamics 7 | from rlil.environments import State, Action, GymEnvironment 8 | from rlil.presets.continuous.models import fc_dynamics 9 | 10 | 11 | @pytest.fixture 12 | def setUp(): 13 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 14 | dynamics_model = fc_dynamics(env) 15 | 16 | dynamics_optimizer = torch.optim.Adam(dynamics_model.parameters()) 17 | dynamics = Dynamics(model=dynamics_model, 18 | optimizer=dynamics_optimizer) 19 | 20 | yield env, dynamics 21 | 22 | 23 | def test_forward(setUp): 24 | env, dynamics = setUp 25 | state = env.reset() 26 | for _ in range(10): 27 | action = Action( 28 | torch.tensor(env.action_space.sample()).unsqueeze(0)) 29 | output = dynamics(state, action) 30 | assert state.shape == output.shape 31 | tt.assert_equal(state.mask, output.mask) 32 | 33 | 34 | def test_reinforce(setUp): 35 | env, dynamics = setUp 36 | state = env.reset() 37 | action = Action( 38 | torch.tensor(env.action_space.sample()).unsqueeze(0)) 39 | output = dynamics(state, action) 40 | loss = mse_loss(state.features, output.features) 41 | for _ in range(10): 42 | output = dynamics(state, action) 43 | new_loss = mse_loss(state.features, output.features) 44 | dynamics.reinforce(new_loss) 45 | assert new_loss < loss 46 | -------------------------------------------------------------------------------- /tests/approximation/ensemble_q_continuous_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch_testing as tt 4 | from torch.nn.functional import mse_loss 5 | from rlil import nn 6 | from rlil.approximation.ensemble_q_continuous import EnsembleQContinuous 7 | from rlil.environments import State, Action, GymEnvironment 8 | from rlil.presets.continuous.models import fc_q 9 | import numpy as np 10 | 11 | 12 | @pytest.fixture 13 | def setUp(): 14 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 15 | num_qs = 3 16 | num_samples = 5 17 | q_models = nn.ModuleList([fc_q(env) for _ in range(num_qs)]) 18 | qs_optimizer = torch.optim.Adam(q_models.parameters()) 19 | qs = EnsembleQContinuous(q_models, qs_optimizer) 20 | Action.set_action_space(env.action_space) 21 | sample_states = State.from_list([env.reset() for _ in range(num_samples)]) 22 | sample_actions = Action( 23 | torch.tensor([env.action_space.sample() for _ in range(num_samples)])) 24 | 25 | yield qs, sample_states, sample_actions 26 | 27 | 28 | def test_forward(setUp): 29 | qs, states, 
actions = setUp 30 | q_values = qs(states, actions) 31 | assert q_values.shape == (5, 3) 32 | with pytest.raises(AssertionError): 33 | tt.assert_almost_equal(q_values[0][0], q_values[0][1]) 34 | tt.assert_almost_equal(q_values[0][0], q_values[0][2]) 35 | 36 | 37 | def test_q1(setUp): 38 | qs, states, actions = setUp 39 | q_values = qs.q1(states, actions) 40 | assert q_values.shape == (5, ) 41 | 42 | 43 | def test_reinforce(setUp): 44 | qs, states, actions = setUp 45 | q_values = qs(states, actions) 46 | qs_params = [param.data.clone() for param in qs.model.parameters()] 47 | qs.reinforce(q_values.sum()) 48 | new_qs_params = [param.data for param in qs.model.parameters()] 49 | 50 | for param, new_param in zip(qs_params, new_qs_params): 51 | with pytest.raises(AssertionError): 52 | tt.assert_almost_equal(param, new_param) 53 | -------------------------------------------------------------------------------- /tests/approximation/feature_network_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from torch import nn 4 | import torch_testing as tt 5 | from torch.optim import Adam 6 | from rlil.environments import State, GymEnvironment 7 | from rlil.presets.continuous.models import fc_actor_critic 8 | from rlil.approximation import FeatureNetwork, VNetwork 9 | from rlil.policies.gaussian import GaussianPolicy 10 | 11 | 12 | STATE_DIM = 2 13 | 14 | 15 | @pytest.fixture 16 | def setUp(): 17 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 18 | 19 | feature_model, value_model, policy_model = fc_actor_critic(env) 20 | value_optimizer = Adam(value_model.parameters()) 21 | policy_optimizer = Adam(policy_model.parameters()) 22 | feature_optimizer = Adam(feature_model.parameters()) 23 | 24 | feature_nw = FeatureNetwork(feature_model, feature_optimizer) 25 | v = VNetwork(value_model, value_optimizer) 26 | policy = GaussianPolicy(policy_model, policy_optimizer, env.action_space) 27 | 28 | states = env.reset() 29 | yield states, feature_nw, v, policy 30 | 31 | 32 | def test_share_output(setUp): 33 | states, feature_nw, v, policy = setUp 34 | 35 | states = feature_nw(states) 36 | value = v(states) 37 | action = policy(states).sample() 38 | 39 | value_loss = value.sum() 40 | policy_loss = policy(states).log_prob(action+1).sum() 41 | 42 | policy.reinforce(policy_loss) 43 | v.reinforce(value_loss) 44 | feature_nw.reinforce() 45 | 46 | 47 | def test_independent_output(setUp): 48 | states, feature_nw, v, policy = setUp 49 | 50 | v_states = feature_nw(states) 51 | p_states = feature_nw(states) 52 | value = v(v_states) 53 | action = policy(p_states).sample() 54 | 55 | value_loss = value.sum() 56 | policy_loss = policy(p_states).log_prob(action+1).sum() 57 | 58 | policy.reinforce(policy_loss) 59 | v.reinforce(value_loss) 60 | feature_nw.reinforce() 61 | -------------------------------------------------------------------------------- /tests/approximation/q_network_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | import gym 4 | from torch import nn 5 | from torch.nn.functional import smooth_l1_loss 6 | import torch_testing as tt 7 | import numpy as np 8 | from rlil.environments import State, Action 9 | from rlil.approximation import QNetwork, FixedTarget 10 | 11 | STATE_DIM = 2 12 | ACTIONS = 3 13 | action_space = gym.spaces.Discrete(10) 14 | 15 | 16 | class TestQNetwork(unittest.TestCase): 17 | def setUp(self): 18 | torch.manual_seed(2) 19 
| self.model = nn.Sequential( 20 | nn.Linear(STATE_DIM, ACTIONS) 21 | ) 22 | 23 | def optimizer(params): 24 | return torch.optim.SGD(params, lr=0.1) 25 | self.q = QNetwork(self.model, optimizer) 26 | 27 | def test_eval_list(self): 28 | states = State( 29 | torch.randn(5, STATE_DIM), 30 | mask=torch.tensor([1, 1, 0, 1, 0]) 31 | ) 32 | result = self.q.eval(states) 33 | tt.assert_almost_equal( 34 | result, 35 | torch.tensor([ 36 | [-0.238509, -0.726287, -0.034026], 37 | [-0.35688755, -0.6612102, 0.34849477], 38 | [0., 0., 0.], 39 | [0.1944, -0.5536, -0.2345], 40 | [0., 0., 0.] 41 | ]), 42 | decimal=2 43 | ) 44 | 45 | def test_eval_actions(self): 46 | states = State(torch.randn(3, STATE_DIM)) 47 | Action.set_action_space(action_space) 48 | actions = Action(torch.tensor([1, 2, 0]).unsqueeze(1)) 49 | result = self.q.eval(states, actions) 50 | self.assertEqual(result.shape, torch.Size([3])) 51 | tt.assert_almost_equal(result, torch.tensor( 52 | [-0.7262873, 0.3484948, -0.0296164])) 53 | 54 | def test_target_net(self): 55 | torch.manual_seed(2) 56 | model = nn.Sequential( 57 | nn.Linear(1, 1) 58 | ) 59 | optimizer = torch.optim.SGD(model.parameters(), lr=0.1) 60 | q = QNetwork( 61 | model, 62 | optimizer, 63 | target=FixedTarget(3) 64 | ) 65 | inputs = State(torch.tensor([1.]).unsqueeze(0)) 66 | 67 | def loss(policy_value): 68 | target = policy_value - 1 69 | return smooth_l1_loss(policy_value, target.detach()) 70 | 71 | policy_value = q(inputs) 72 | target_value = q.target(inputs).item() 73 | np.testing.assert_equal(policy_value.item(), -0.008584141731262207) 74 | np.testing.assert_equal(target_value, -0.008584141731262207) 75 | 76 | q.reinforce(loss(policy_value)) 77 | policy_value = q(inputs) 78 | target_value = q.target(inputs).item() 79 | np.testing.assert_equal(policy_value.item(), -0.20858412981033325) 80 | np.testing.assert_equal(target_value, -0.008584141731262207) 81 | 82 | q.reinforce(loss(policy_value)) 83 | policy_value = q(inputs) 84 | target_value = q.target(inputs).item() 85 | np.testing.assert_equal(policy_value.item(), -0.4085841178894043) 86 | np.testing.assert_equal(target_value, -0.008584141731262207) 87 | 88 | q.reinforce(loss(policy_value)) 89 | policy_value = q(inputs) 90 | target_value = q.target(inputs).item() 91 | np.testing.assert_equal(policy_value.item(), -0.6085841655731201) 92 | np.testing.assert_equal(target_value, -0.6085841655731201) 93 | 94 | q.reinforce(loss(policy_value)) 95 | policy_value = q(inputs) 96 | target_value = q.target(inputs).item() 97 | np.testing.assert_equal(policy_value.item(), -0.8085841536521912) 98 | np.testing.assert_equal(target_value, -0.6085841655731201) 99 | 100 | 101 | if __name__ == '__main__': 102 | unittest.main() 103 | -------------------------------------------------------------------------------- /tests/approximation/v_network_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | from torch import nn 4 | import torch_testing as tt 5 | from rlil.approximation.v_network import VNetwork 6 | from rlil.environments import State 7 | 8 | STATE_DIM = 2 9 | 10 | 11 | def loss(value, error): 12 | target = value + error 13 | return ((target.detach() - value) ** 2).mean() 14 | 15 | 16 | class TestVNetwork(unittest.TestCase): 17 | def setUp(self): 18 | torch.manual_seed(2) 19 | self.model = nn.Sequential( 20 | nn.Linear(STATE_DIM, 1) 21 | ) 22 | 23 | optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1) 24 | self.v = VNetwork(self.model, optimizer) 25 | 26 | 
def test_reinforce_list(self): 27 | states = State( 28 | torch.randn(5, STATE_DIM), 29 | mask=torch.tensor([1, 1, 0, 1, 0]) 30 | ) 31 | result = self.v(states) 32 | tt.assert_almost_equal(result, torch.tensor( 33 | [0.7053187, 0.3975691, 0., 0.2701665, 0.])) 34 | 35 | self.v.reinforce(loss(result, torch.tensor([1, -1, 1, 1, 1])).float()) 36 | result = self.v(states) 37 | tt.assert_almost_equal(result, torch.tensor( 38 | [0.9732854, 0.5453826, 0., 0.4344811, 0.])) 39 | 40 | def test_multi_reinforce(self): 41 | states = State( 42 | torch.randn(5, STATE_DIM), 43 | mask=torch.tensor([1, 1, 0, 1, 0, 0]) 44 | ) 45 | result1 = self.v(states[0:2]) 46 | self.v.reinforce(loss(result1, torch.tensor([1, 2])).float()) 47 | result2 = self.v(states[2:4]) 48 | self.v.reinforce(loss(result2, torch.tensor([1, 1])).float()) 49 | result3 = self.v(states[4:6]) 50 | self.v.reinforce(loss(result3, torch.tensor([1, 2])).float()) 51 | with self.assertRaises(Exception): 52 | self.v.reinforce(loss(result3, torch.tensor([1, 2])).float()) 53 | 54 | 55 | if __name__ == '__main__': 56 | unittest.main() 57 | -------------------------------------------------------------------------------- /tests/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/benchmark/__init__.py -------------------------------------------------------------------------------- /tests/benchmark/action_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | import torch 4 | import numpy as np 5 | from rlil.environments import Action, State 6 | import rlil.initializer as init 7 | 8 | 9 | @pytest.fixture() 10 | def set_action_space(): 11 | action_space = gym.spaces.Box( 12 | low=np.array([-1, -10]), high=np.array([1, 10])) 13 | Action.set_action_space(action_space) 14 | 15 | raw = torch.tensor([[0, 0], [2, 2], [-20, -20]], dtype=torch.float32) 16 | yield raw 17 | 18 | 19 | def test_create_action_debug(set_action_space, 20 | benchmark): 21 | init.enable_debug_mode() 22 | assert init.is_debug_mode() 23 | 24 | raw = set_action_space 25 | action = benchmark.pedantic(Action, 26 | kwargs={'raw': raw}, 27 | rounds=100, 28 | iterations=5) 29 | 30 | 31 | def test_create_action(set_action_space, 32 | benchmark): 33 | init.disable_debug_mode() 34 | assert not init.is_debug_mode() 35 | 36 | raw = set_action_space 37 | action = benchmark.pedantic(Action, 38 | kwargs={'raw': raw}, 39 | rounds=100, 40 | iterations=5) 41 | 42 | 43 | def get_features(action): 44 | return action.features 45 | 46 | 47 | def test_features_action_cpu(set_action_space, 48 | benchmark): 49 | raw = set_action_space 50 | action = Action(raw) 51 | 52 | benchmark.pedantic(get_features, 53 | rounds=100, 54 | kwargs={"action": action}, 55 | iterations=5) 56 | 57 | 58 | def test_features_action_cuda(set_action_space, 59 | benchmark): 60 | if not torch.cuda.is_available(): 61 | pytest.skip("CUDA is not available") 62 | 63 | raw = set_action_space 64 | action = Action(raw.to("cuda")) 65 | 66 | benchmark.pedantic(get_features, 67 | rounds=100, 68 | kwargs={"action": action}, 69 | iterations=5) 70 | -------------------------------------------------------------------------------- /tests/benchmark/cpu_gpu_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import numpy as np 4 | from rlil.environments 
import GymEnvironment, State 5 | from rlil.presets.continuous import ddpg 6 | 7 | 8 | def collect_samples(agent, env): 9 | while len(agent.replay_buffer) < 100: 10 | env.reset() 11 | while not env.done: 12 | env.step(agent.act(env.state, env.reward)) 13 | 14 | 15 | def test_ddpg_cuda(benchmark, use_gpu): 16 | if not torch.cuda.is_available(): 17 | pytest.skip("CUDA is not available") 18 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 19 | agent_fn = ddpg(replay_start_size=100) 20 | agent = agent_fn(env) 21 | collect_samples(agent, env) 22 | assert agent.should_train() 23 | benchmark.pedantic(agent.train, rounds=100) 24 | 25 | 26 | def test_ddpg_cpu(benchmark, use_cpu): 27 | if not torch.cuda.is_available(): 28 | pytest.skip("CUDA is not available") 29 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 30 | agent_fn = ddpg(replay_start_size=100) 31 | agent = agent_fn(env) 32 | collect_samples(agent, env) 33 | assert agent.should_train() 34 | benchmark.pedantic(agent.train, rounds=100) -------------------------------------------------------------------------------- /tests/benchmark/state_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | import torch 4 | import numpy as np 5 | from rlil.environments import Action, State 6 | import rlil.initializer as init 7 | 8 | 9 | def test_create_state_debug(benchmark): 10 | init.enable_debug_mode() 11 | assert init.is_debug_mode() 12 | 13 | raw = torch.randn(3, 4) 14 | benchmark.pedantic(State, 15 | kwargs={'raw': raw}, 16 | rounds=100, 17 | iterations=5) 18 | 19 | 20 | def test_create_state(benchmark): 21 | init.disable_debug_mode() 22 | assert not init.is_debug_mode() 23 | 24 | raw = torch.randn(3, 4) 25 | benchmark.pedantic(State, 26 | kwargs={'raw': raw}, 27 | rounds=100, 28 | iterations=5) 29 | -------------------------------------------------------------------------------- /tests/benchmark/train_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import numpy as np 4 | from rlil.environments import GymEnvironment, State 5 | from rlil.presets.continuous import ddpg, sac, td3, bc 6 | from ..presets.offline_continuous_test import get_transitions 7 | 8 | 9 | def collect_samples(agent, env): 10 | while len(agent.replay_buffer) < 100: 11 | env.reset() 12 | while not env.done: 13 | env.step(agent.act(env.state, env.reward)) 14 | 15 | 16 | def test_ddpg(benchmark): 17 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 18 | agent_fn = ddpg(replay_start_size=100) 19 | agent = agent_fn(env) 20 | collect_samples(agent, env) 21 | assert agent.should_train() 22 | benchmark.pedantic(agent.train, rounds=100) 23 | 24 | 25 | def test_sac(benchmark): 26 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 27 | agent_fn = sac(replay_start_size=100) 28 | agent = agent_fn(env) 29 | collect_samples(agent, env) 30 | assert agent.should_train() 31 | benchmark.pedantic(agent.train, rounds=100) 32 | 33 | 34 | def test_td3(benchmark): 35 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 36 | agent_fn = td3(replay_start_size=100) 37 | agent = agent_fn(env) 38 | collect_samples(agent, env) 39 | assert agent.should_train() 40 | benchmark.pedantic(agent.train, rounds=100) 41 | 42 | 43 | def test_bc(benchmark): 44 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 45 | transitions = get_transitions(env) 46 | agent_fn = 
bc(transitions) 47 | agent = agent_fn(env) 48 | assert len(transitions["obs"]) > 100 49 | benchmark.pedantic(agent.train, rounds=100) 50 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | from rlil.initializer import set_seed, enable_debug_mode, set_device, get_device 5 | from rlil import nn 6 | from rlil.environments import Action 7 | from rlil.policies.deterministic import DeterministicPolicyNetwork 8 | from rlil.memory import ExperienceReplayBuffer 9 | 10 | 11 | @pytest.fixture(scope="function", autouse=True) 12 | def seed(): 13 | """set random seed for testing""" 14 | set_seed(0) 15 | 16 | 17 | @pytest.fixture(scope="function", autouse=True) 18 | def debug(): 19 | enable_debug_mode() 20 | 21 | 22 | @pytest.fixture(scope="function", autouse=True) 23 | def reset_action_space(): 24 | Action._action_space = None 25 | 26 | 27 | @pytest.fixture 28 | def use_cpu(): 29 | pre_device = get_device() 30 | set_device("cpu") 31 | yield 32 | set_device(pre_device) 33 | 34 | 35 | @pytest.fixture 36 | def use_gpu(): 37 | pre_device = get_device() 38 | set_device("cuda") 39 | yield 40 | set_device(pre_device) 41 | -------------------------------------------------------------------------------- /tests/environments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/environments/__init__.py -------------------------------------------------------------------------------- /tests/environments/action_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | import torch_testing as tt 5 | import gym 6 | from rlil.environments.action import Action, action_decorator 7 | 8 | 9 | @pytest.fixture() 10 | def set_continuous_action_space(): 11 | action_space = gym.spaces.Box( 12 | low=np.array([-1, -10]), high=np.array([1, 10])) 13 | Action.set_action_space(action_space) 14 | 15 | 16 | @pytest.fixture() 17 | def set_discrete_action_space(): 18 | action_space = gym.spaces.Discrete(4) 19 | Action.set_action_space(action_space) 20 | 21 | 22 | def test_set_action_space_raises(): 23 | """ 24 | Action class should raise when the action_space is not set 25 | """ 26 | with pytest.raises(AssertionError): 27 | Action(torch.Tensor([[2, 3]])) 28 | 29 | 30 | def test_continuous_action(set_continuous_action_space): 31 | # GIVEN a set action_space 32 | 33 | # WHEN a new Action object with valid input is made 34 | # THEN the raw is equal to Action.raw 35 | raw = torch.tensor([[0, 0], [2, 2], [-20, -20]], dtype=torch.float32) 36 | action = Action(raw) 37 | tt.assert_equal(action.raw, raw) 38 | 39 | # WHEN a new Action object with a raw outside the action_space 40 | # THEN the action.features should clipped in the range 41 | tt.assert_equal(action.features, torch.tensor( 42 | [[0, 0], [1, 2], [-1, -10]], dtype=torch.float32)) 43 | 44 | # WHEN a new Action object with invalid input is made 45 | # THEN raise a assertion error 46 | with pytest.raises(AssertionError): 47 | raw = torch.randn(3, 5) 48 | action = Action(raw) 49 | 50 | 51 | def test_discrete_action(set_discrete_action_space): 52 | # GIVEN a set action_space 53 | 54 | # WHEN a new Action object with valid input is made 55 | # THEN the raw is equal 
to Action.raw 56 | raw = torch.tensor([1, 2, 3, 0]).unsqueeze(1) 57 | action = Action(raw) 58 | tt.assert_equal(action.raw, raw) 59 | 60 | # WHEN a new Action object with invalid input is made 61 | # THEN raise a assertion error 62 | with pytest.raises(AssertionError): 63 | raw = torch.tensor([5]) 64 | action = Action(raw) 65 | 66 | 67 | def test_from_list(set_continuous_action_space): 68 | action1 = Action(torch.randn(1, 2)) 69 | action2 = Action(torch.randn(1, 2)) 70 | action3 = Action(torch.randn(1, 2)) 71 | action = Action.from_list([action1, action2, action3]) 72 | tt.assert_equal(action.raw, torch.cat( 73 | (action1.raw, action2.raw, action3.raw))) 74 | 75 | 76 | def test_from_numpy(set_continuous_action_space): 77 | actions = np.array([[1, 2]]) 78 | action = Action.from_numpy(actions) 79 | tt.assert_equal(action.raw, torch.tensor([[1, 2]])) 80 | 81 | 82 | def test_raw_numpy(set_continuous_action_space): 83 | actions = np.array([[1, 2]]) 84 | action = Action.from_numpy(actions) 85 | np.testing.assert_equal(actions, action.raw_numpy()) 86 | 87 | 88 | def test_get_item(): 89 | action_space = gym.spaces.Box(low=np.array( 90 | [-1, -2, -3, -4]), high=np.array([1, 2, 3, 4])) 91 | Action.set_action_space(action_space) 92 | raw = torch.randn(3, 4) 93 | actions = Action(raw) 94 | action = actions[2] 95 | tt.assert_equal(action.raw, raw[2].unsqueeze(0)) 96 | 97 | 98 | def test_len(): 99 | action_space = gym.spaces.Box(low=np.array( 100 | [-1, -2, -3, -4]), high=np.array([1, 2, 3, 4])) 101 | Action.set_action_space(action_space) 102 | action = Action(torch.randn(3, 4)) 103 | assert len(action) == 3 104 | 105 | 106 | def test_action_decorator(): 107 | action_space = gym.spaces.Box(low=-1, high=1, shape=(2, )) 108 | Action.set_action_space(action_space) 109 | @action_decorator 110 | def act(): 111 | return torch.tensor([3, 4]).unsqueeze(0) 112 | 113 | action = act() 114 | tt.assert_equal(action.raw, torch.tensor([3, 4]).unsqueeze(0)) 115 | -------------------------------------------------------------------------------- /tests/environments/gym_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from rlil.environments.gym import GymEnvironment 4 | from rlil.environments import State, Action 5 | import torch 6 | import gym 7 | 8 | 9 | def test_env_discrete(): 10 | env = gym.make('CartPole-v0') 11 | env = GymEnvironment(env) 12 | env.reset() 13 | while not env._state.done: 14 | action = Action.action_space().sample() 15 | action = Action(torch.tensor([action]).unsqueeze(0)) 16 | state, reward = env.step(action) 17 | 18 | 19 | def test_env_continuous(): 20 | env = gym.make('LunarLanderContinuous-v2') 21 | env = GymEnvironment(env) 22 | env.reset() 23 | while not env._state.done: 24 | action = Action.action_space().sample() 25 | action = Action(torch.tensor([action])) 26 | state, reward = env.step(action) 27 | 28 | 29 | def test_append_time(): 30 | env = gym.make('LunarLanderContinuous-v2') 31 | env = GymEnvironment(env, append_time=True) 32 | state = env.reset() 33 | last_timestep = state.raw[0, -1].item() 34 | while not env._state.done: 35 | action = Action.action_space().sample() 36 | action = Action(torch.tensor([action])) 37 | state, reward = env.step(action) 38 | assert state.raw[0, -1].item() > last_timestep 39 | last_timestep = state.raw[0, -1].item() 40 | assert state.shape[1] == env._env.observation_space.shape[0] + 1 41 | -------------------------------------------------------------------------------- 
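The rollout pattern exercised by gym_test.py above (and by the watch/record scripts earlier in this section) is the same everywhere: construct a GymEnvironment, reset it, wrap raw actions in the Action container, and step until env.done. Below is a minimal illustrative sketch of that loop assembled from the calls shown in these tests; it is an editor's sketch rather than repository code, and the episode_return bookkeeping and float(reward) conversion are assumptions made for illustration.

import torch
from rlil.environments import GymEnvironment, Action

# Wrap a gym id; append_time=True appends a timestep feature to observations,
# as checked by test_append_time above.
env = GymEnvironment('LunarLanderContinuous-v2', append_time=True)

state = env.reset()
episode_return = 0.0  # assumed bookkeeping for this example only
while not env.done:
    # Sample a raw action and wrap it with a batch dimension of 1,
    # mirroring test_env_continuous above.
    raw = torch.tensor([Action.action_space().sample()])
    state, reward = env.step(Action(raw))
    episode_return += float(reward)  # assumes reward is a one-element tensor
print('episode return: {}'.format(episode_return))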
/tests/environments/state_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | import torch_testing as tt 5 | from rlil.environments.state import State 6 | 7 | DONE = torch.tensor( 8 | [0], 9 | dtype=torch.bool, 10 | ) 11 | 12 | NOT_DONE = torch.tensor( 13 | [1], 14 | dtype=torch.bool, 15 | ) 16 | 17 | 18 | def test_constructor_defaults(): 19 | raw = torch.randn(3, 4) 20 | state = State(raw) 21 | # state.features returns raw 22 | tt.assert_equal(state.features, raw) 23 | # state.mask returns 1 default 24 | tt.assert_equal(state.mask, torch.ones(3, dtype=torch.bool)) 25 | # state.raw == raw 26 | tt.assert_equal(state.raw, raw) 27 | assert state.info == [None] * 3 28 | 29 | 30 | def test_custom_constructor_args(): 31 | raw = torch.randn(3, 4) 32 | mask = torch.zeros(3).bool() 33 | info = ['a', 'b', 'c'] 34 | state = State(raw, mask=mask, info=info) 35 | tt.assert_equal(state.features, raw) 36 | # check zeros masks 37 | tt.assert_equal(state.mask, torch.zeros(3, dtype=torch.bool)) 38 | # check info constructor 39 | assert state.info == info 40 | 41 | 42 | def test_not_done(): 43 | state = State(torch.randn(1, 4)) 44 | assert not state.done 45 | 46 | 47 | def test_done(): 48 | raw = torch.randn(1, 4) 49 | state = State(raw, mask=DONE) 50 | assert state.done 51 | 52 | 53 | def test_from_list(): 54 | state1 = State(torch.randn(1, 4), mask=DONE, info=['a']) 55 | state2 = State(torch.randn(1, 4), mask=NOT_DONE, info=['b']) 56 | state3 = State(torch.randn(1, 4)) 57 | state = State.from_list([state1, state2, state3]) 58 | tt.assert_equal(state.raw, torch.cat( 59 | (state1.raw, state2.raw, state3.raw))) 60 | tt.assert_equal(state.mask, torch.tensor([0, 1, 1])) 61 | assert state.info == ['a', 'b', None] 62 | 63 | 64 | def test_from_numpy(): 65 | gym_obs = np.array([1, 2, 3]) 66 | done = True 67 | info = ['a'] 68 | with pytest.raises(AssertionError): 69 | state = State.from_numpy(gym_obs, done, info) 70 | gym_obs = np.random.randn(3, 5) 71 | done = np.zeros(3, dtype=np.bool) 72 | info = ['a'] 73 | state = State.from_numpy(gym_obs, done, info) 74 | 75 | tt.assert_equal(state.raw, torch.tensor(gym_obs, dtype=torch.float32), ) 76 | tt.assert_equal(state.done, torch.tensor(done)) 77 | assert state.info == ['a'] 78 | 79 | 80 | def test_raw_numpy(): 81 | np_raws = np.random.randn(3, 4) 82 | np_masks = np.ones(3) 83 | state = State(torch.tensor(np_raws), mask=torch.tensor(np_masks)) 84 | out_np_raws, out_np_dones = state.raw_numpy() 85 | np.testing.assert_equal(np_raws, out_np_raws) 86 | np.testing.assert_equal(np_masks, ~out_np_dones) 87 | 88 | 89 | def test_get_item(): 90 | raw = torch.randn(3, 4) 91 | states = State(raw) 92 | state = states[2] 93 | tt.assert_equal(state.raw, raw[2].unsqueeze(0)) 94 | tt.assert_equal(state.mask, NOT_DONE) 95 | assert state.info == [None] 96 | 97 | 98 | def test_len(): 99 | state = State(torch.randn(3, 4)) 100 | assert len(state) == 3 101 | -------------------------------------------------------------------------------- /tests/experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/experiments/__init__.py -------------------------------------------------------------------------------- /tests/experiments/experiment_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 
import numpy as np 3 | import torch 4 | import ray 5 | from rlil.presets.continuous import sac 6 | from rlil.environments import GymEnvironment 7 | from rlil.experiments import Experiment, Trainer 8 | from rlil.utils.writer import Writer 9 | from rlil.initializer import set_writer 10 | from rlil.samplers import AsyncSampler 11 | 12 | 13 | class MockWriter(Writer): 14 | def __init__(self, label): 15 | self.data = {} 16 | self.label = label 17 | self.sample_frames = 0 18 | self.sample_episodes = 1 19 | self.train_steps = 0 20 | 21 | def add_scalar(self, key, value, step="sample_frames"): 22 | key = key + "/" + step 23 | if key not in self.data: 24 | self.data[key] = {"values": [], "steps": []} 25 | self.data[key]["values"].append(value) 26 | self.data[key]["steps"].append(self._get_step_value(step)) 27 | 28 | def add_text(self, name, text, step="sample_frames"): 29 | pass 30 | 31 | def _get_step_value(self, _type): 32 | if _type == "sample_frames": 33 | return self.sample_frames 34 | if _type == "sample_episodes": 35 | return self.sample_episodes 36 | if _type == "train_steps": 37 | return self.train_steps 38 | return _type 39 | 40 | 41 | class MockExperiment(Experiment): 42 | def __init__( 43 | self, 44 | agent_fn, 45 | env, 46 | exp_info='default_experiments', 47 | num_workers=1, 48 | max_sample_frames=np.inf, 49 | max_sample_episodes=np.inf, 50 | ): 51 | 52 | # set writer 53 | agent_name = agent_fn.__name__ 54 | writer = self._make_writer(agent_name, env.name, exp_info) 55 | set_writer(writer) 56 | 57 | # start training 58 | agent = agent_fn(env) 59 | 60 | sampler = AsyncSampler(env, num_workers=num_workers) 61 | eval_sampler = AsyncSampler(env) 62 | 63 | trainer = Trainer( 64 | agent=agent, 65 | sampler=sampler, 66 | eval_sampler=eval_sampler, 67 | max_sample_frames=max_sample_frames, 68 | max_sample_episodes=max_sample_episodes 69 | ) 70 | 71 | trainer.start_training() 72 | 73 | def _make_writer(self, agent_name, env_name, 74 | exp_info="default_experiments"): 75 | self._writer = MockWriter(agent_name + '_' + env_name) 76 | return self._writer 77 | 78 | 79 | def test_adds_label(): 80 | ray.init(include_webui=False, ignore_reinit_error=True) 81 | env = GymEnvironment('Pendulum-v0', append_time=True) 82 | experiment = MockExperiment(sac(), env, max_sample_episodes=1) 83 | assert experiment._writer.label == "_sac_Pendulum-v0" 84 | 85 | 86 | @pytest.mark.skip() 87 | def test_writes_returns_eps(): 88 | ray.init(include_webui=False, ignore_reinit_error=True) 89 | env = GymEnvironment('Pendulum-v0', append_time=True) 90 | experiment = MockExperiment(sac(), env, max_sample_episodes=3) 91 | np.testing.assert_equal( 92 | experiment._writer.data["returns/episode"]["steps"], 93 | np.array([1, 2, 3]), 94 | ) 95 | 96 | 97 | if __name__ == "__main__": 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /tests/experiments/trainer_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | from torch.optim import Adam 5 | import gym 6 | import time 7 | import ray 8 | from rlil.environments import GymEnvironment 9 | from rlil import nn 10 | from rlil.experiments import Trainer 11 | from rlil.samplers import AsyncSampler 12 | from rlil.memory import ExperienceReplayBuffer 13 | from rlil.initializer import set_replay_buffer 14 | from rlil.presets.continuous import sac 15 | from ..mock_agent import MockAgent 16 | 17 | 18 | @pytest.fixture 19 | def setUp(): 20 | 
ray.init(include_webui=False, ignore_reinit_error=True) 21 | 22 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 23 | 24 | replay_buffer_size = 100000 25 | replay_buffer = ExperienceReplayBuffer(replay_buffer_size, env) 26 | set_replay_buffer(replay_buffer) 27 | 28 | agent = MockAgent(env) 29 | num_workers = 3 30 | sampler = AsyncSampler( 31 | env, 32 | num_workers=num_workers, 33 | ) 34 | 35 | yield env, agent, sampler 36 | 37 | 38 | def test_trainer_frames(setUp): 39 | max_sample_frames = 100 40 | env, agent, sampler = setUp 41 | trainer = Trainer(agent, sampler, max_sample_frames=max_sample_frames) 42 | trainer.start_training() 43 | assert trainer._writer.sample_frames > max_sample_frames 44 | 45 | 46 | def test_trainer_episodes(setUp): 47 | max_sample_episodes = 5 48 | env, agent, sampler = setUp 49 | trainer = Trainer(agent, sampler, max_sample_episodes=max_sample_episodes) 50 | trainer.start_training() 51 | assert trainer._writer.sample_frames > max_sample_episodes 52 | 53 | 54 | def test_training(setUp): 55 | env, agent, sampler = setUp 56 | agent_fn = sac(replay_start_size=50) 57 | agent = agent_fn(env) 58 | 59 | trainer = Trainer(agent, sampler, max_sample_episodes=5) 60 | trainer.start_training() 61 | -------------------------------------------------------------------------------- /tests/memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/memory/__init__.py -------------------------------------------------------------------------------- /tests/memory/gae_wrapper_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import random 3 | import torch 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | import torch_testing as tt 8 | from rlil.approximation import VNetwork, FeatureNetwork 9 | from rlil.environments import State, Action, GymEnvironment 10 | from rlil.memory import ExperienceReplayBuffer, GaeWrapper 11 | from rlil.presets.continuous.models import fc_actor_critic 12 | from rlil.utils import Samples 13 | 14 | 15 | class DummyFeatures: 16 | def target(self, states): 17 | return states 18 | 19 | 20 | class DummyV: 21 | def target(self, feature): 22 | return torch.ones(len(feature)) 23 | 24 | 25 | @pytest.fixture 26 | def setUp(use_cpu): 27 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 28 | buffer = ExperienceReplayBuffer(1000, env) 29 | gae_buffer = GaeWrapper(buffer, discount_factor=1, lam=0.3) 30 | 31 | # base buffer 32 | states = [env.observation_space.sample() for i in range(4)] 33 | actions = [env.action_space.sample() for i in range(3)] 34 | states = State(torch.tensor(states)) 35 | states, next_states = states[:-1], states[1:] 36 | actions = Action(torch.tensor(actions)) 37 | rewards = torch.arange(0, 3, dtype=torch.float) 38 | samples = Samples(states, actions, rewards, next_states) 39 | gae_buffer.store(samples) 40 | 41 | feature_nw = DummyFeatures() 42 | v = DummyV() 43 | yield gae_buffer, feature_nw, v 44 | 45 | 46 | def test_advantage(setUp): 47 | gae_buffer, feature_nw, v = setUp 48 | 49 | states, _, rewards, next_states, _, _ = gae_buffer.get_all_transitions() 50 | values = v.target(feature_nw.target(states)) 51 | next_values = v.target(feature_nw.target(next_states)) 52 | advantages = gae_buffer.compute_gae(rewards, values, 53 | next_values, next_states.mask) 54 | 55 | # rewards: [0, 1, 
2] 56 | # td_errors: [0, 1, 2] 57 | expected = torch.tensor([0 + 1 * 0.3 + 2 * 0.3 * 0.3, 58 | 1 + 2 * 0.3, 59 | 2]) 60 | tt.assert_almost_equal( 61 | advantages, 62 | (expected - expected.mean()) / expected.std(), decimal=3) 63 | -------------------------------------------------------------------------------- /tests/memory/gail_wrapper_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import random 3 | import torch 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | import torch_testing as tt 8 | from rlil.environments import State, Action, GymEnvironment 9 | from rlil.memory import ExperienceReplayBuffer, GailWrapper 10 | from rlil.presets.continuous.models import fc_discriminator 11 | from rlil.approximation import Discriminator 12 | from rlil.initializer import set_device 13 | from rlil.utils import Samples 14 | 15 | 16 | @pytest.fixture 17 | def setUp(use_cpu): 18 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 19 | replay_buffer = ExperienceReplayBuffer(1000, env) 20 | 21 | # base buffer 22 | states = State(torch.tensor([env.observation_space.sample()]*100)) 23 | actions = Action(torch.tensor([env.action_space.sample()]*99)) 24 | rewards = torch.arange(0, 99, dtype=torch.float) 25 | samples = Samples(states[:-1], actions, rewards, states[1:]) 26 | replay_buffer.store(samples) 27 | 28 | # expert buffer 29 | exp_replay_buffer = ExperienceReplayBuffer(1000, env) 30 | exp_states = State(torch.tensor([env.observation_space.sample()]*100)) 31 | exp_actions = Action(torch.tensor([env.action_space.sample()]*99)) 32 | exp_rewards = torch.arange(100, 199, dtype=torch.float) 33 | exp_samples = Samples( 34 | exp_states[:-1], exp_actions, exp_rewards, exp_states[1:]) 35 | exp_replay_buffer.store(exp_samples) 36 | # discriminator 37 | discriminator_model = fc_discriminator(env) 38 | discriminator_optimizer = Adam(discriminator_model.parameters()) 39 | discriminator = Discriminator(discriminator_model, 40 | discriminator_optimizer) 41 | 42 | gail_buffer = GailWrapper(replay_buffer, exp_replay_buffer, discriminator) 43 | 44 | samples = { 45 | "buffer": {"states": states, 46 | "actions": actions, 47 | "rewards": rewards}, 48 | "expert": {"states": states, 49 | "actions": actions, 50 | "rewards": rewards}, 51 | } 52 | yield gail_buffer, samples 53 | 54 | 55 | def test_sample(setUp): 56 | gail_buffer, samples = setUp 57 | res_states, res_actions, res_rewards, res_next_states, _, _ = \ 58 | gail_buffer.sample(4) 59 | 60 | # test states 61 | tt.assert_equal(res_states.features[0], 62 | samples["buffer"]["states"].features[0]) 63 | 64 | # test actions 65 | tt.assert_equal(res_actions.features[0], 66 | samples["buffer"]["actions"].features[0]) 67 | 68 | # test next_states 69 | tt.assert_equal( 70 | res_next_states.features[0], samples["buffer"]["states"].features[0]) 71 | 72 | 73 | def test_sample_both(setUp): 74 | gail_buffer, samples = setUp 75 | samples, expert_samples = gail_buffer.sample_both(4) 76 | 77 | 78 | def test_store(setUp): 79 | gail_buffer, samples = setUp 80 | assert len(gail_buffer) == 99 81 | 82 | gail_samples = Samples(samples["buffer"]["states"][:-1], 83 | samples["buffer"]["actions"], 84 | samples["buffer"]["rewards"], 85 | samples["buffer"]["states"][1:]) 86 | gail_buffer.store(gail_samples) 87 | 88 | assert len(gail_buffer) == 198 89 | 90 | 91 | def test_clear(setUp): 92 | gail_buffer, samples = setUp 93 | gail_buffer.clear() 94 | assert len(gail_buffer) == 0 95 | assert 
len(gail_buffer.expert_buffer) != 0 96 | -------------------------------------------------------------------------------- /tests/memory/sqil_wrapper_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import random 3 | import torch 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | import torch_testing as tt 8 | from rlil.environments import State, Action, GymEnvironment 9 | from rlil.memory import ExperienceReplayBuffer, SqilWrapper 10 | from rlil.initializer import set_device 11 | from rlil.utils import Samples 12 | 13 | 14 | @pytest.fixture 15 | def setUp(use_cpu): 16 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 17 | replay_buffer = ExperienceReplayBuffer(1000, env) 18 | 19 | # base buffer 20 | states = State(torch.tensor([env.observation_space.sample()]*10)) 21 | actions = Action(torch.tensor([env.action_space.sample()]*9)) 22 | rewards = torch.arange(0, 9, dtype=torch.float) 23 | samples = Samples(states[:-1], actions, rewards, states[1:]) 24 | replay_buffer.store(samples) 25 | 26 | # expert buffer 27 | exp_replay_buffer = ExperienceReplayBuffer(1000, env) 28 | exp_states = State(torch.tensor([env.observation_space.sample()]*10)) 29 | exp_actions = Action(torch.tensor([env.action_space.sample()]*9)) 30 | exp_rewards = torch.arange(10, 19, dtype=torch.float) 31 | exp_samples = Samples( 32 | exp_states[:-1], exp_actions, exp_rewards, exp_states[1:]) 33 | exp_replay_buffer.store(exp_samples) 34 | sqil_buffer = SqilWrapper(replay_buffer, exp_replay_buffer) 35 | 36 | samples = { 37 | "buffer": {"states": states, 38 | "actions": actions, 39 | "rewards": rewards}, 40 | "expert": {"states": states, 41 | "actions": actions, 42 | "rewards": rewards}, 43 | } 44 | yield sqil_buffer, samples 45 | 46 | 47 | def test_sample(setUp): 48 | sqil_buffer, samples = setUp 49 | res_states, res_actions, res_rewards, res_next_states, _, _ = \ 50 | sqil_buffer.sample(40) 51 | 52 | # test rewards 53 | # half of the rewards are 1 and the others are 0 54 | assert res_rewards.sum() == len(res_rewards) / 2 55 | -------------------------------------------------------------------------------- /tests/mock_agent.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import torch 4 | from rlil import nn 5 | from rlil.environments import Action 6 | from rlil.policies.deterministic import DeterministicPolicyNetwork 7 | from rlil.memory import ExperienceReplayBuffer 8 | from rlil.initializer import get_replay_buffer, get_n_step 9 | from rlil.utils import Samples 10 | 11 | 12 | class MockAgent: 13 | def __init__(self, env): 14 | model = nn.Sequential( 15 | nn.Flatten(), 16 | nn.Linear(env.state_space.shape[0], 17 | Action.action_space().shape[0]) 18 | ) 19 | self.policy_model = DeterministicPolicyNetwork( 20 | model, Action.action_space()) 21 | 22 | self._state = None 23 | self._action = None 24 | self.replay_buffer = get_replay_buffer() 25 | 26 | def act(self, state, reward): 27 | samples = Samples(self._state, self._action, reward, state) 28 | self.replay_buffer.store(samples) 29 | self._state = state 30 | 31 | with torch.no_grad(): 32 | action = self.policy_model( 33 | state.to(self.policy_model.device)) 34 | 35 | self._action = Action(action).to("cpu") 36 | return self._action 37 | 38 | def make_lazy_agent(self): 39 | return MockLazyAgent(self.policy_model) 40 | 41 | def train(self): 42 | pass 43 | 44 | 45 | class MockLazyAgent: 46 | def 
__init__(self, policy_model): 47 | self._state = None 48 | self._action = None 49 | self.policy_model = policy_model 50 | self.replay_buffer = None 51 | # for N step replay buffer 52 | self._n_step, self._discount_factor = get_n_step() 53 | 54 | def set_replay_buffer(self, env): 55 | self.replay_buffer = ExperienceReplayBuffer( 56 | 1e7, env, n_step=self._n_step, 57 | discount_factor=self._discount_factor) 58 | 59 | def act(self, state, reward): 60 | samples = Samples(self._state, self._action, reward, state) 61 | self.replay_buffer.store(samples) 62 | self._state = state 63 | 64 | with torch.no_grad(): 65 | action = self.policy_model( 66 | state.to(self.policy_model.device)) 67 | 68 | self._action = Action(action).to("cpu") 69 | return self._action 70 | 71 | def compute_priorities(self, samples): 72 | return None 73 | -------------------------------------------------------------------------------- /tests/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/nn/__init__.py -------------------------------------------------------------------------------- /tests/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/policies/__init__.py -------------------------------------------------------------------------------- /tests/policies/bcq_deterministic_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import unittest 3 | import torch 4 | import torch_testing as tt 5 | import numpy as np 6 | from gym.spaces import Box 7 | from rlil import nn 8 | from rlil.approximation import FixedTarget 9 | from rlil.environments import State, Action, squash_action 10 | from rlil.policies import BCQDeterministicPolicy 11 | 12 | STATE_DIM = 2 13 | ACTION_DIM = 3 14 | 15 | 16 | class TestBCQDeterministic(unittest.TestCase): 17 | def setUp(self): 18 | torch.manual_seed(2) 19 | self.model = nn.Sequential( 20 | nn.Linear0(STATE_DIM + ACTION_DIM, ACTION_DIM) 21 | ) 22 | self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01) 23 | self.space = Box(np.array([-1, -1, -1]), 24 | np.array([1, 1, 1]), dtype=np.float32) 25 | self.policy = BCQDeterministicPolicy( 26 | self.model, 27 | self.optimizer, 28 | self.space 29 | ) 30 | Action.set_action_space(self.space) 31 | 32 | def test_output_shape(self): 33 | state = State(torch.randn(1, STATE_DIM)) 34 | vae_action = Action(torch.randn(1, ACTION_DIM)) 35 | action = self.policy(state, vae_action) 36 | self.assertEqual(action.shape, (1, ACTION_DIM)) 37 | state = State(torch.randn(5, STATE_DIM)) 38 | vae_action = Action(torch.randn(5, ACTION_DIM)) 39 | action = self.policy(state, vae_action) 40 | self.assertEqual(action.shape, (5, ACTION_DIM)) 41 | 42 | def test_step_one(self): 43 | state = State(torch.randn(1, STATE_DIM)) 44 | vae_action = Action(torch.randn(1, ACTION_DIM)) 45 | self.policy(state, vae_action) 46 | self.policy.step() 47 | 48 | @pytest.mark.skip 49 | def test_converge(self): 50 | state = State(torch.randn(1, STATE_DIM)) 51 | vae_action = Action(torch.randn(1, ACTION_DIM)) 52 | target = vae_action.features + torch.tensor([[0.25, 0.5, -0.5]]) 53 | 54 | for _ in range(0, 200): 55 | action = self.policy(state, vae_action) 56 | loss = ((target - action) ** 2).mean() 57 | loss.backward() 58 | 
self.policy.step() 59 | 60 | self.assertLess(loss, 0.001) 61 | 62 | @pytest.mark.skip 63 | def test_target(self): 64 | self.policy = BCQDeterministicPolicy( 65 | self.model, 66 | self.optimizer, 67 | self.space, 68 | target=FixedTarget(3) 69 | ) 70 | 71 | # choose initial action 72 | state = State(torch.ones(1, STATE_DIM)) 73 | vae_action = Action(torch.ones(1, ACTION_DIM)) 74 | action = self.policy(state, vae_action) 75 | tt.assert_equal(action, squash_action( 76 | vae_action.features, action_space=self.space)) 77 | 78 | # run update step, make sure target network doesn't change 79 | action.sum().backward(retain_graph=True) 80 | self.policy.step() 81 | tt.assert_equal(self.policy.target(state, vae_action), 82 | squash_action(vae_action.features, action_space=self.space)) 83 | 84 | # again... 85 | action.sum().backward(retain_graph=True) 86 | self.policy.step() 87 | tt.assert_equal(self.policy.target(state, vae_action), 88 | squash_action(vae_action.features, action_space=self.space)) 89 | 90 | # third time, target should be updated 91 | action.sum().backward(retain_graph=True) 92 | self.policy.step() 93 | # tt.assert_allclose( 94 | # self.policy.eval(state, vae_action), 95 | # torch.tensor([[-0.595883, -0.595883, -0.595883]]), 96 | # atol=1e-4, 97 | # ) 98 | 99 | 100 | if __name__ == '__main__': 101 | unittest.main() 102 | -------------------------------------------------------------------------------- /tests/policies/deterministic_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import unittest 3 | import torch 4 | import torch_testing as tt 5 | import numpy as np 6 | from gym.spaces import Box 7 | from rlil import nn 8 | from rlil.approximation import FixedTarget 9 | from rlil.environments import State 10 | from rlil.policies import DeterministicPolicy 11 | 12 | STATE_DIM = 2 13 | ACTION_DIM = 3 14 | 15 | 16 | class TestDeterministic(unittest.TestCase): 17 | def setUp(self): 18 | torch.manual_seed(2) 19 | self.model = nn.Sequential( 20 | nn.Linear0(STATE_DIM, ACTION_DIM) 21 | ) 22 | self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01) 23 | self.space = Box(np.array([-1, -1, -1]), 24 | np.array([1, 1, 1]), dtype=np.float32) 25 | self.policy = DeterministicPolicy( 26 | self.model, 27 | self.optimizer, 28 | self.space 29 | ) 30 | 31 | def test_output_shape(self): 32 | state = State(torch.randn(1, STATE_DIM)) 33 | action = self.policy(state) 34 | self.assertEqual(action.shape, (1, ACTION_DIM)) 35 | state = State(torch.randn(5, STATE_DIM)) 36 | action = self.policy(state) 37 | self.assertEqual(action.shape, (5, ACTION_DIM)) 38 | 39 | def test_step_one(self): 40 | state = State(torch.randn(1, STATE_DIM)) 41 | self.policy(state) 42 | self.policy.step() 43 | 44 | def test_converge(self): 45 | state = State(torch.randn(1, STATE_DIM)) 46 | target = torch.tensor([0.25, 0.5, -0.5]) 47 | 48 | for _ in range(0, 200): 49 | action = self.policy(state) 50 | loss = ((target - action) ** 2).mean() 51 | self.policy.reinforce(loss) 52 | 53 | self.assertLess(loss, 0.001) 54 | 55 | @pytest.mark.skip 56 | def test_target(self): 57 | self.policy = DeterministicPolicy( 58 | self.model, 59 | self.optimizer, 60 | self.space, 61 | target=FixedTarget(3) 62 | ) 63 | 64 | # choose initial action 65 | state = State(torch.ones(1, STATE_DIM)) 66 | action = self.policy(state) 67 | tt.assert_equal(action, torch.zeros(1, ACTION_DIM)) 68 | 69 | # run update step, make sure target network doesn't change 70 | 
action.sum().backward(retain_graph=True) 71 | self.policy.step() 72 | tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM)) 73 | 74 | # again... 75 | action.sum().backward(retain_graph=True) 76 | self.policy.step() 77 | tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM)) 78 | 79 | # third time, target should be updated 80 | action.sum().backward(retain_graph=True) 81 | self.policy.step() 82 | 83 | 84 | if __name__ == '__main__': 85 | unittest.main() 86 | -------------------------------------------------------------------------------- /tests/policies/gaussian_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | import torch_testing as tt 6 | from gym.spaces import Box 7 | from rlil.environments import State 8 | from rlil.policies import GaussianPolicy 9 | 10 | STATE_DIM = 2 11 | ACTION_DIM = 3 12 | 13 | 14 | class TestGaussian(unittest.TestCase): 15 | def setUp(self): 16 | 17 | torch.manual_seed(2) 18 | self.space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1])) 19 | self.model = nn.Sequential( 20 | nn.Linear(STATE_DIM, ACTION_DIM * 2) 21 | ) 22 | optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.01) 23 | self.policy = GaussianPolicy(self.model, optimizer, self.space) 24 | 25 | def test_output_shape(self): 26 | state = State(torch.randn(1, STATE_DIM)) 27 | action = self.policy(state).sample() 28 | self.assertEqual(action.shape, (1, ACTION_DIM)) 29 | state = State(torch.randn(5, STATE_DIM)) 30 | action = self.policy(state).sample() 31 | self.assertEqual(action.shape, (5, ACTION_DIM)) 32 | 33 | def test_reinforce_one(self): 34 | state = State(torch.randn(1, STATE_DIM)) 35 | dist = self.policy(state) 36 | action = dist.sample() 37 | log_prob1 = dist.log_prob(action) 38 | loss = -log_prob1.mean() 39 | self.policy.reinforce(loss) 40 | 41 | dist = self.policy(state) 42 | log_prob2 = dist.log_prob(action) 43 | 44 | self.assertGreater(log_prob2.item(), log_prob1.item()) 45 | 46 | def test_converge(self): 47 | state = State(torch.randn(1, STATE_DIM)) 48 | target = torch.tensor([1., 2., -1.]) 49 | 50 | for _ in range(0, 1000): 51 | dist = self.policy(state) 52 | action = dist.sample() 53 | log_prob = dist.log_prob(action) 54 | error = ((target - action) ** 2).mean() 55 | loss = (error * log_prob).mean() 56 | self.policy.reinforce(loss) 57 | 58 | self.assertTrue(error < 1) 59 | 60 | 61 | if __name__ == '__main__': 62 | unittest.main() 63 | -------------------------------------------------------------------------------- /tests/policies/soft_deterministic_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | import torch_testing as tt 6 | from gym.spaces import Box 7 | from rlil.environments import State 8 | from rlil.policies import SoftDeterministicPolicy 9 | 10 | STATE_DIM = 2 11 | ACTION_DIM = 3 12 | 13 | 14 | @pytest.fixture 15 | def setUp(): 16 | torch.manual_seed(2) 17 | space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1])) 18 | model = nn.Sequential( 19 | nn.Linear(STATE_DIM, ACTION_DIM * 2) 20 | ) 21 | optimizer = torch.optim.RMSprop(model.parameters(), lr=0.01) 22 | policy = SoftDeterministicPolicy(model, optimizer, space) 23 | yield policy 24 | 25 | 26 | def test_output_shape(setUp): 27 | policy = setUp 28 | state = State(torch.randn(1, STATE_DIM)) 29 | action, _ = policy(state) 30 | assert 
action.shape == (1, ACTION_DIM) 31 | 32 | state = State(torch.randn(5, STATE_DIM)) 33 | action, _ = policy(state) 34 | assert action.shape == (5, ACTION_DIM) 35 | 36 | 37 | def test_reinforce_one(setUp): 38 | policy = setUp 39 | state = State(torch.randn(1, STATE_DIM)) 40 | action, log_prob1 = policy(state) 41 | loss = -log_prob1.mean() 42 | policy.reinforce(loss) 43 | 44 | action, log_prob2 = policy(state) 45 | 46 | assert log_prob2.item() > log_prob1.item() 47 | 48 | 49 | def test_sample_multiple(setUp): 50 | policy = setUp 51 | state = State(torch.randn(5, STATE_DIM)) 52 | actions, raw_actions = policy.sample_multiple(state, num_sample=10) 53 | assert actions.shape == (5, 10, ACTION_DIM) 54 | assert raw_actions.shape == (5, 10, ACTION_DIM) 55 | -------------------------------------------------------------------------------- /tests/policies/softmax_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch 3 | from torch import nn 4 | import torch_testing as tt 5 | from rlil.environments import State 6 | from rlil.policies import SoftmaxPolicy 7 | 8 | STATE_DIM = 2 9 | ACTIONS = 3 10 | 11 | 12 | class TestSoftmax(unittest.TestCase): 13 | def setUp(self): 14 | torch.manual_seed(2) 15 | self.model = nn.Sequential( 16 | nn.Linear(STATE_DIM, ACTIONS) 17 | ) 18 | optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1) 19 | self.policy = SoftmaxPolicy(self.model, optimizer) 20 | 21 | def test_run(self): 22 | state1 = State(torch.randn(1, STATE_DIM)) 23 | dist1 = self.policy(state1) 24 | action1 = dist1.sample() 25 | log_prob1 = dist1.log_prob(action1) 26 | self.assertEqual(action1.item(), 0) 27 | 28 | state2 = State(torch.randn(1, STATE_DIM)) 29 | dist2 = self.policy(state2) 30 | action2 = dist2.sample() 31 | log_prob2 = dist2.log_prob(action2) 32 | self.assertEqual(action2.item(), 2) 33 | 34 | loss = -(torch.tensor([-1, 1000000]) * 35 | torch.cat((log_prob1, log_prob2))).mean() 36 | self.policy.reinforce(loss) 37 | 38 | state3 = State(torch.randn(1, STATE_DIM)) 39 | dist3 = self.policy(state3) 40 | action3 = dist3.sample() 41 | self.assertEqual(action3.item(), 2) 42 | 43 | def test_multi_action(self): 44 | states = State(torch.randn(3, STATE_DIM)) 45 | actions = self.policy(states).sample() 46 | tt.assert_equal(actions, torch.tensor([2, 2, 0])) 47 | 48 | def test_list(self): 49 | torch.manual_seed(1) 50 | states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 0, 1])) 51 | dist = self.policy(states) 52 | actions = dist.sample() 53 | log_probs = dist.log_prob(actions) 54 | tt.assert_equal(actions, torch.tensor([1, 2, 1])) 55 | loss = -(torch.tensor([[1, 2, 3]]) * log_probs).mean() 56 | self.policy.reinforce(loss) 57 | 58 | def test_reinforce(self): 59 | def loss(log_probs): 60 | return -log_probs.mean() 61 | 62 | states = State(torch.randn(3, STATE_DIM), torch.tensor([1, 1, 1])) 63 | actions = self.policy.eval(states).sample() 64 | 65 | # notice the values increase with each successive reinforce 66 | log_probs = self.policy(states).log_prob(actions) 67 | tt.assert_almost_equal(log_probs, torch.tensor( 68 | [-0.84, -0.62, -0.757]), decimal=3) 69 | self.policy.reinforce(loss(log_probs)) 70 | log_probs = self.policy(states).log_prob(actions) 71 | tt.assert_almost_equal(log_probs, torch.tensor( 72 | [-0.811, -0.561, -0.701]), decimal=3) 73 | self.policy.reinforce(loss(log_probs)) 74 | log_probs = self.policy(states).log_prob(actions) 75 | tt.assert_almost_equal(log_probs, torch.tensor( 76 | [-0.785, -0.51, -0.651]), 
decimal=3) 77 | 78 | 79 | if __name__ == '__main__': 80 | unittest.main() 81 | -------------------------------------------------------------------------------- /tests/presets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/presets/__init__.py -------------------------------------------------------------------------------- /tests/presets/offline_continuous_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | from rlil.environments import GymEnvironment 4 | from rlil.presets.continuous import bcq, bc, vae_bc, bear, brac 5 | from rlil.presets import env_validation, trainer_validation 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.environments import Action 8 | from rlil.initializer import set_replay_buffer 9 | from copy import deepcopy 10 | from ..mock_agent import MockAgent 11 | 12 | 13 | def get_transitions(env): 14 | replay_buffer = ExperienceReplayBuffer(1000, env) 15 | set_replay_buffer(replay_buffer) 16 | agent = MockAgent(env) 17 | 18 | while len(agent.replay_buffer) < 200: 19 | env.reset() 20 | while not env.done: 21 | env.step(agent.act(env.state, env.reward)) 22 | 23 | return agent.replay_buffer.get_all_transitions(return_cpprb=True) 24 | 25 | 26 | def test_bcq(): 27 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 28 | transitions = get_transitions(env) 29 | assert len(transitions["obs"]) > 100 30 | 31 | env_validation(bcq(transitions), env, done_step=50) 32 | trainer_validation(bcq(transitions), env) 33 | 34 | 35 | def test_bear(): 36 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 37 | transitions = get_transitions(env) 38 | assert len(transitions["obs"]) > 100 39 | 40 | env_validation(bear(transitions), env, done_step=50) 41 | trainer_validation(bear(transitions), env) 42 | 43 | 44 | def test_brac(): 45 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 46 | transitions = get_transitions(env) 47 | assert len(transitions["obs"]) > 100 48 | 49 | env_validation(brac(transitions, bc_iters=5), env, done_step=50) 50 | trainer_validation(brac(transitions, bc_iters=5), env) 51 | 52 | 53 | def test_bc(): 54 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 55 | transitions = get_transitions(env) 56 | assert len(transitions["obs"]) > 100 57 | 58 | env_validation(bc(transitions), env, done_step=50) 59 | trainer_validation(bc(transitions), env) 60 | 61 | 62 | def test_vae_bc(): 63 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 64 | transitions = get_transitions(env) 65 | assert len(transitions["obs"]) > 100 66 | 67 | env_validation(vae_bc(transitions), env, done_step=50) 68 | trainer_validation(vae_bc(transitions), env) 69 | -------------------------------------------------------------------------------- /tests/presets/online_continuous_test.py: -------------------------------------------------------------------------------- 1 | import ptvsd 2 | import pytest 3 | from rlil.environments import GymEnvironment 4 | from rlil.presets.continuous import vac, ddpg, sac, td3, noisy_td3, ppo, rs_mpc 5 | from rlil.presets import env_validation, trainer_validation 6 | from rlil.initializer import set_device 7 | 8 | 9 | def test_vac(): 10 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 11 | env_validation(vac(replay_start_size=50), env, 
done_step=50) 12 | trainer_validation(vac(replay_start_size=50), env) 13 | 14 | 15 | def test_ddpg(): 16 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 17 | env_validation(ddpg(replay_start_size=50), env, done_step=50) 18 | trainer_validation(ddpg(replay_start_size=50), env) 19 | 20 | 21 | def test_sac(use_cpu): 22 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 23 | env_validation(sac(replay_start_size=50), env, done_step=50) 24 | trainer_validation(sac(replay_start_size=50), env) 25 | 26 | 27 | def test_n_step(): 28 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 29 | for preset in [ddpg, td3, sac]: 30 | agent_fn = preset(n_step=5) 31 | agent = agent_fn(env) 32 | lazy_agent = agent.make_lazy_agent() 33 | lazy_agent.set_replay_buffer(env) 34 | assert lazy_agent._n_step == 5 35 | 36 | 37 | def test_prioritized(): 38 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 39 | for preset in [ddpg, td3, sac]: 40 | env_validation(preset(prioritized=True, replay_start_size=50), 41 | env, done_step=50) 42 | 43 | 44 | def test_td3(): 45 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 46 | env_validation(td3(replay_start_size=50), env, done_step=50) 47 | trainer_validation(td3(replay_start_size=50), env) 48 | 49 | 50 | def test_noisy_td3(): 51 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 52 | env_validation(noisy_td3(replay_start_size=50), env, done_step=50) 53 | trainer_validation(noisy_td3(replay_start_size=50), env) 54 | 55 | 56 | def test_ppo(): 57 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 58 | env_validation(ppo(replay_start_size=5), env, done_step=50) 59 | trainer_validation(ppo(replay_start_size=50), env) 60 | 61 | 62 | def test_rs_mpc(): 63 | env = GymEnvironment("Pendulum-v0", append_time=True) 64 | env_validation(rs_mpc(replay_start_size=5), env, done_step=50) 65 | trainer_validation(rs_mpc(replay_start_size=5), env) 66 | 67 | 68 | def test_apex(use_cpu): 69 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 70 | for preset in [ddpg, td3, sac]: 71 | trainer_validation( 72 | preset(replay_start_size=5, use_apex=True), env, apex=True) 73 | -------------------------------------------------------------------------------- /tests/presets/online_il_continuous_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | from rlil.environments import GymEnvironment 4 | from rlil.presets.continuous import airl, gail, sqil, td3, sac, ppo 5 | from rlil.presets import env_validation, trainer_validation 6 | from rlil.memory import ExperienceReplayBuffer 7 | from rlil.environments import Action 8 | from rlil.initializer import set_replay_buffer, get_writer 9 | from copy import deepcopy 10 | from ..mock_agent import MockAgent 11 | 12 | 13 | def get_transitions(env): 14 | replay_buffer = ExperienceReplayBuffer(1000, env) 15 | set_replay_buffer(replay_buffer) 16 | agent = MockAgent(env) 17 | 18 | while len(agent.replay_buffer) < 100: 19 | env.reset() 20 | while not env.done: 21 | env.step(agent.act(env.state, env.reward)) 22 | 23 | return agent.replay_buffer.get_all_transitions(return_cpprb=True) 24 | 25 | 26 | def test_gail(use_cpu): 27 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 28 | transitions = get_transitions(env) 29 | base_agent_fn = td3(replay_start_size=0) 30 | assert len(transitions["obs"]) > 100 31 | 32 | 
env_validation(gail(transitions=transitions, 33 | base_agent_fn=base_agent_fn, 34 | replay_start_size=10), env, done_step=50) 35 | trainer_validation(gail(transitions=transitions, 36 | base_agent_fn=base_agent_fn, 37 | replay_start_size=10), env) 38 | 39 | writer = get_writer() 40 | assert writer.train_steps > 1 41 | 42 | 43 | def test_sqil(use_cpu): 44 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 45 | transitions = get_transitions(env) 46 | base_agent_fn = sac(replay_start_size=0) 47 | assert len(transitions["obs"]) > 100 48 | 49 | env_validation(sqil(transitions=transitions, 50 | base_agent_fn=base_agent_fn, 51 | replay_start_size=10), env, done_step=50) 52 | trainer_validation(sqil(transitions=transitions, 53 | base_agent_fn=base_agent_fn, 54 | replay_start_size=10), env) 55 | 56 | writer = get_writer() 57 | assert writer.train_steps > 1 58 | 59 | 60 | def test_airl(): 61 | env = GymEnvironment("LunarLanderContinuous-v2", append_time=True) 62 | transitions = get_transitions(env) 63 | base_agent_fn = ppo(replay_start_size=0) 64 | assert len(transitions["obs"]) > 100 65 | 66 | env_validation(airl(transitions=transitions, 67 | base_agent_fn=base_agent_fn, 68 | replay_start_size=10), env, done_step=50) 69 | trainer_validation(airl(transitions=transitions, 70 | base_agent_fn=base_agent_fn, 71 | replay_start_size=10), env) 72 | 73 | writer = get_writer() 74 | assert writer.train_steps > 1 75 | -------------------------------------------------------------------------------- /tests/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/samplers/__init__.py -------------------------------------------------------------------------------- /tests/samplers/asyncsampler_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import torch 4 | import gym 5 | import time 6 | import warnings 7 | import ray 8 | from rlil import nn 9 | from rlil.environments import GymEnvironment, Action 10 | from rlil.policies.deterministic import DeterministicPolicyNetwork 11 | from rlil.samplers import AsyncSampler, StartInfo 12 | from rlil.memory import ExperienceReplayBuffer 13 | from rlil.initializer import set_replay_buffer 14 | from ..mock_agent import MockAgent 15 | 16 | 17 | @pytest.fixture() 18 | def setUp(): 19 | ray.init(include_webui=False, ignore_reinit_error=True) 20 | 21 | env = GymEnvironment('LunarLanderContinuous-v2', append_time=True) 22 | 23 | replay_buffer_size = 100000 24 | replay_buffer = ExperienceReplayBuffer(replay_buffer_size, env) 25 | set_replay_buffer(replay_buffer) 26 | 27 | agent = MockAgent(env) 28 | 29 | yield {"env": env, "agent": agent} 30 | 31 | 32 | def test_sampler_episode(setUp): 33 | env = setUp["env"] 34 | agent = setUp["agent"] 35 | 36 | num_workers = 3 37 | worker_episodes = 6 38 | sampler = AsyncSampler( 39 | env, 40 | num_workers=num_workers, 41 | ) 42 | lazy_agent = agent.make_lazy_agent() 43 | sampler.start_sampling( 44 | lazy_agent, worker_episodes=worker_episodes) 45 | sample_result = sampler.store_samples(timeout=1e8) 46 | 47 | # GIVEN the store_samples function with infinite timeout 48 | # WHEN worker_episodes are specified 49 | # THEN sampler collects samples by the num of num_workers * worker_episodes 50 | assert len(sample_result[StartInfo()]["frames"] 51 | ) == num_workers * worker_episodes 52 | 53 | 54 | def 
test_sampler_frames(setUp): 55 | env = setUp["env"] 56 | agent = setUp["agent"] 57 | 58 | num_workers = 3 59 | worker_frames = 50 60 | sampler = AsyncSampler( 61 | env, 62 | num_workers=num_workers, 63 | ) 64 | 65 | lazy_agent = agent.make_lazy_agent() 66 | sampler.start_sampling( 67 | lazy_agent, worker_frames=worker_frames) 68 | sample_result = sampler.store_samples(timeout=1e8) 69 | 70 | # GIVEN the store_samples function with infinite timeout 71 | # WHEN worker_frames are specified 72 | # THEN sampler collects samples until frames exceeds worker_frames * num_workers 73 | assert sum(sample_result[StartInfo()]["frames"] 74 | ) > worker_frames * num_workers 75 | 76 | 77 | def test_ray_wait(setUp): 78 | env = setUp["env"] 79 | agent = setUp["agent"] 80 | sampler = AsyncSampler( 81 | env, 82 | num_workers=3, 83 | ) 84 | 85 | worker_episodes = 100 86 | lazy_agent = agent.make_lazy_agent() 87 | sampler.start_sampling( 88 | lazy_agent, worker_episodes=worker_episodes) 89 | sampler.store_samples(timeout=0.1) 90 | 91 | # GIVEN the store_samples function with short timeout 92 | # WHEN worker_episodes is large 93 | # THEN sampler doesn't wait the worker finishes sampling 94 | assert len(sampler.replay_buffer) == 0 95 | 96 | 97 | def test_eval_sampler(setUp): 98 | env = setUp["env"] 99 | agent = setUp["agent"] 100 | sampler = AsyncSampler( 101 | env, 102 | num_workers=3, 103 | ) 104 | 105 | worker_episodes = 3 106 | lazy_agent = agent.make_lazy_agent() 107 | start_info = StartInfo(sample_frames=100, 108 | sample_episodes=1000, 109 | train_steps=10000) 110 | sampler.start_sampling( 111 | lazy_agent, 112 | worker_episodes=worker_episodes, 113 | start_info=start_info 114 | ) 115 | 116 | result = sampler.store_samples(timeout=1e9, evaluation=True) 117 | # when evaluation=True, sampler doesn't store samples to the replay_buffer 118 | assert len(sampler.replay_buffer) == 0 119 | 120 | result["info_list"] 121 | -------------------------------------------------------------------------------- /tests/utils/writer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syuntoku14/pytorch-rl-il/e613b73acdb634f2f7a00ac88e7aa3431c1852bf/tests/utils/writer/__init__.py -------------------------------------------------------------------------------- /tests/utils/writer/writer_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from rlil.utils.writer import ExperimentWriter 4 | from rlil.initializer import set_writer, get_writer 5 | from shutil import rmtree 6 | import pathlib 7 | import os 8 | import pandas as pd 9 | from tensorboard.backend.event_processing import event_accumulator 10 | 11 | 12 | @pytest.fixture() 13 | def init_writer(): 14 | writer = ExperimentWriter(agent_name="test_agent", 15 | env_name="test_env", 16 | exp_info="test_exp", 17 | sample_frame_interval=10, 18 | sample_episode_interval=100, 19 | train_step_interval=1000) 20 | 21 | # GIVEN sample_frame_interval == 10 22 | # WHEN add_scalar with step="sample_frames" is called 23 | # THEN sample_frames is saved every 10 samples 24 | for i in range(100): 25 | writer.sample_frames = i 26 | writer.add_scalar("test", i, step="sample_frames") 27 | 28 | # same test for sample_episodes 29 | for i in range(1000): 30 | writer.sample_episodes = i 31 | writer.add_scalar("test", i, step="sample_episodes") 32 | 33 | # same test for train_steps 34 | for i in range(10000): 35 | writer.train_steps = i 36 | 
writer.add_scalar("test", i, step="train_steps") 37 | 38 | set_writer(writer) 39 | 40 | # load events file 41 | test_path = pathlib.Path("runs/test_exp") 42 | for p in test_path.rglob("events*"): 43 | eventspath = p 44 | event_acc = event_accumulator.EventAccumulator( 45 | str(eventspath), size_guidance={'scalars': 0}) 46 | 47 | # test make dir 48 | assert os.path.isdir(str(test_path)) 49 | 50 | yield event_acc 51 | 52 | # rm test_exp dir 53 | writer.close() 54 | rmtree(str(test_path), ignore_errors=True) 55 | 56 | 57 | def read_scalars(event_acc): 58 | scalars = {} 59 | steps = {} 60 | 61 | for tag in event_acc.Tags()['scalars']: 62 | events = event_acc.Scalars(tag) 63 | scalars[tag] = [event.value for event in events] 64 | steps[tag] = [event.step for event in events] 65 | return steps, scalars 66 | 67 | 68 | def test_get_step_value(init_writer): 69 | writer = get_writer() 70 | writer.sample_frames = 1 71 | writer.sample_episodes = 2 72 | writer.train_steps = 3 73 | 74 | assert writer._get_step_value("sample_frames") == 1 75 | assert writer._get_step_value("sample_episodes") == 2 76 | assert writer._get_step_value("train_steps") == 3 77 | 78 | 79 | def test_add_scalar_interval(init_writer): 80 | writer = get_writer() 81 | writer.close() 82 | 83 | event_acc = init_writer 84 | event_acc.Reload() 85 | 86 | steps, scalars = read_scalars(event_acc) 87 | assert scalars['test_env/test/sample_frames'] == [ 88 | i for i in range(10, 100, 10)] 89 | 90 | assert scalars['test_env/test/sample_episodes'] == [ 91 | i for i in range(100, 1000, 100)] 92 | 93 | assert scalars['test_env/test/train_steps'] == [ 94 | i for i in range(1000, 10000, 1000)] 95 | 96 | 97 | def test_step_value(init_writer): 98 | writer = get_writer() 99 | writer.sample_frames = 1e9 100 | writer.add_scalar("test", 999, step="sample_frames", step_value=12345) 101 | writer.close() 102 | 103 | event_acc = init_writer 104 | event_acc.Reload() 105 | steps, scalars = read_scalars(event_acc) 106 | assert 12345 in steps["test_env/test/sample_frames"] 107 | 108 | 109 | def test_save_csv(init_writer): 110 | writer = get_writer() 111 | writer.sample_frames = 1e9 112 | writer.add_scalar("test", 500, step="sample_frames", save_csv=True) 113 | 114 | test_path = pathlib.Path("runs/test_exp") 115 | for p in test_path.rglob("*.csv"): 116 | csv_file = p 117 | 118 | csv_data = pd.read_csv(str(csv_file), names=["sample_frames", "return"]) 119 | assert csv_data["sample_frames"].tolist() == [1e9] 120 | --------------------------------------------------------------------------------