├── MANIFEST.in
├── assets
│   ├── ChainerRL.png
│   ├── breakout.gif
│   ├── grasping.gif
│   └── humanoid.gif
├── chainerrl
│   ├── v_functions
│   │   ├── __init__.py
│   │   └── v_functions.py
│   ├── initializers
│   │   ├── __init__.py
│   │   ├── normal.py
│   │   └── constant.py
│   ├── envs
│   │   ├── __init__.py
│   │   └── serial_vector_env.py
│   ├── optimizers
│   │   ├── __init__.py
│   │   ├── nonbias_weight_decay.py
│   │   └── rmsprop_async.py
│   ├── q_functions
│   │   ├── __init__.py
│   │   └── dueling_dqn.py
│   ├── policies
│   │   ├── __init__.py
│   │   ├── mellowmax_policy.py
│   │   └── softmax_policy.py
│   ├── functions
│   │   ├── __init__.py
│   │   ├── scale_grad.py
│   │   ├── bound_by_tanh.py
│   │   ├── sum_arrays.py
│   │   ├── weighted_sum_arrays.py
│   │   ├── invert_gradients.py
│   │   └── mellowmax.py
│   ├── misc
│   │   ├── ask_yes_no.py
│   │   ├── __init__.py
│   │   ├── makedirs.py
│   │   ├── batch_states.py
│   │   ├── is_return_code_zero.py
│   │   ├── init_like_torch.py
│   │   ├── random.py
│   │   ├── reward_filter.py
│   │   ├── conjugate_gradient.py
│   │   ├── random_seed.py
│   │   ├── draw_computational_graph.py
│   │   ├── env_modifiers.py
│   │   └── copy_param.py
│   ├── explorers
│   │   ├── __init__.py
│   │   ├── greedy.py
│   │   ├── additive_gaussian.py
│   │   ├── boltzmann.py
│   │   ├── additive_ou.py
│   │   └── epsilon_greedy.py
│   ├── wrappers
│   │   ├── __init__.py
│   │   ├── scale_reward.py
│   │   ├── render.py
│   │   ├── cast_observation.py
│   │   ├── continuing_time_limit.py
│   │   └── randomize_action.py
│   ├── links
│   │   ├── __init__.py
│   │   ├── mlp.py
│   │   ├── noisy_chain.py
│   │   ├── sequence.py
│   │   ├── dqn_head.py
│   │   ├── noisy_linear.py
│   │   └── mlp_bn.py
│   ├── v_function.py
│   ├── q_function.py
│   ├── policy.py
│   ├── experiments
│   │   ├── __init__.py
│   │   └── hooks.py
│   ├── explorer.py
│   ├── agents
│   │   ├── __init__.py
│   │   ├── double_dqn.py
│   │   ├── sarsa.py
│   │   ├── residual_dqn.py
│   │   ├── double_pal.py
│   │   ├── al.py
│   │   ├── pal.py
│   │   └── dpp.py
│   ├── env.py
│   └── __init__.py
├── readthedocs.yml
├── .gitignore
├── requirements-dev.txt
├── docs
│   ├── reference.rst
│   ├── recurrent.rst
│   ├── experiments.rst
│   ├── action_values.rst
│   ├── install.rst
│   ├── distributions.rst
│   ├── Makefile
│   ├── agents.rst
│   ├── make.bat
│   └── index.rst
├── requirements.txt
├── examples
│   ├── README.md
│   ├── ale
│   │   ├── README.md
│   │   └── dqn_phi.py
│   ├── gym
│   │   └── README.md
│   ├── grasping
│   │   └── README.md
│   └── atari
│       └── dqn
│           └── README.md
├── tests
│   ├── misc_tests
│   │   ├── test_is_return_code_zero.py
│   │   ├── test_conjugate_gradient.py
│   │   ├── test_batch_states.py
│   │   ├── test_random_seed.py
│   │   ├── test_copy_param.py
│   │   ├── test_collections.py
│   │   ├── test_draw_computational_graph.py
│   │   └── test_random.py
│   ├── explorers_tests
│   │   ├── test_additive_gaussian.py
│   │   ├── test_additive_ou.py
│   │   ├── test_boltzmann.py
│   │   └── test_epsilon_greedy.py
│   ├── experiments_tests
│   │   ├── test_hooks.py
│   │   └── test_train_agent.py
│   ├── q_functions_tests
│   │   └── basetest_state_action_q_function.py
│   ├── agents_tests
│   │   ├── test_ddpg.py
│   │   ├── test_sarsa.py
│   │   ├── test_pgt.py
│   │   ├── test_al.py
│   │   ├── test_pal.py
│   │   ├── test_double_pal.py
│   │   ├── test_double_dqn.py
│   │   ├── basetest_agents.py
│   │   ├── test_residual_dqn.py
│   │   └── test_dpp.py
│   ├── links_tests
│   │   ├── test_noisy_chain.py
│   │   ├── test_mlp_bn.py
│   │   ├── test_sequence.py
│   │   ├── test_noisy_linear.py
│   │   └── test_empirical_normalization.py
│   ├── wrappers_tests
│   │   ├── test_scale_reward.py
│   │   ├── test_continuing_time_limit.py
│   │   ├── test_cast_observation.py
│   │   ├── test_render.py
│   │   └── test_randomize_action.py
│   ├── optimizer_tests
│   │   └── test_nonbias_weight_decay.py
│   ├── functions_tests
│   │   ├── test_sum_arrays.py
│   │   ├── test_weighted_sum_arrays.py
│   │   ├── test_lower_triangular_matrix.py
│   │   └── test_invert_gradients.py
│   ├── test_ale.py
│   ├── envs_tests
│   │   └── test_vector_envs.py
│   └── test_agent.py
├── setup.py
├── LICENSE
├── CONTRIBUTING.md
├── .travis.yml
└── tools
    └── plot_scores.py
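The tree above mirrors ChainerRL's public import layout: each subpackage is re-exported through the `__init__.py` files reproduced below. A minimal sketch of how these paths are reached from user code (the names are taken from those `__init__.py` files; no real training setup is shown):

```
import chainerrl

# Subdirectories of chainerrl/ map one-to-one onto public modules.
explorer = chainerrl.explorers.Greedy()                        # explorers/greedy.py
wrap_obs = chainerrl.wrappers.CastObservationToFloat32         # wrappers/cast_observation.py
agent_cls = chainerrl.agents.DoubleDQN                         # agents/double_dqn.py
train_fn = chainerrl.experiments.train_agent_with_evaluation  # experiments/train_agent.py

print(explorer)  # -> Greedy()
```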
/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | -------------------------------------------------------------------------------- /assets/ChainerRL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imos/chainerrl/master/assets/ChainerRL.png -------------------------------------------------------------------------------- /assets/breakout.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imos/chainerrl/master/assets/breakout.gif -------------------------------------------------------------------------------- /assets/grasping.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imos/chainerrl/master/assets/grasping.gif -------------------------------------------------------------------------------- /assets/humanoid.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imos/chainerrl/master/assets/humanoid.gif -------------------------------------------------------------------------------- /chainerrl/v_functions/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.v_functions.v_functions import * # NOQA 2 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | name: chainerrl 2 | type: sphinx 3 | base: docs 4 | python: 5 | setup_py_install: true 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .ipynb_checkpoints 3 | chainerrl.egg-info 4 | build/ 5 | dist/ 6 | .idea/ 7 | results/ 8 | examples/gym/results/ 9 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | autopep8 3 | atari_py 4 | flake8 5 | mock 6 | opencv-python 7 | pytest 8 | sphinx 9 | sphinx_rtd_theme 10 | -------------------------------------------------------------------------------- /chainerrl/initializers/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.initializers.constant import VarianceScalingConstant # NOQA 2 | from chainerrl.initializers.normal import LeCunNormal # NOQA 3 | -------------------------------------------------------------------------------- /chainerrl/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.envs.multiprocess_vector_env import MultiprocessVectorEnv # NOQA 2 | from chainerrl.envs.serial_vector_env import SerialVectorEnv # NOQA 3 | -------------------------------------------------------------------------------- /chainerrl/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.optimizers.nonbias_weight_decay import NonbiasWeightDecay # noqa 2 | from chainerrl.optimizers.rmsprop_async import RMSpropAsync # noqa 3 | -------------------------------------------------------------------------------- /docs/reference.rst: 
-------------------------------------------------------------------------------- 1 | ============= 2 | API Reference 3 | ============= 4 | 5 | .. toctree:: 6 | :maxdepth: 1 7 | 8 | action_values 9 | agents 10 | distributions 11 | experiments 12 | recurrent 13 | -------------------------------------------------------------------------------- /chainerrl/q_functions/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.q_functions.dueling_dqn import * # NOQA 2 | from chainerrl.q_functions.state_action_q_functions import * # NOQA 3 | from chainerrl.q_functions.state_q_functions import * # NOQA 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cached-property 2 | chainer>=3.1.0 3 | fastcache; python_version<'3.2' 4 | funcsigs; python_version<'3.5' 5 | future 6 | gym>=0.9.7 7 | numpy>=1.10.4 8 | pillow 9 | scipy 10 | statistics; python_version<'3.4' 11 | -------------------------------------------------------------------------------- /chainerrl/policies/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.policies.deterministic_policy import * # NOQA 2 | from chainerrl.policies.gaussian_policy import * # NOQA 3 | from chainerrl.policies.mellowmax_policy import * # NOQA 4 | from chainerrl.policies.softmax_policy import * # NOQA 5 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | - `ale`: examples for Atari 2600 games in the Arcade Learning Environment 4 | - `gym`: examples for OpenAI Gym environments 5 | - `grasping`: examples for a Bullet-based robotic grasping environment 6 | - `quickstart`: a quickstart guide of ChainerRL 7 | -------------------------------------------------------------------------------- /chainerrl/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.functions.sum_arrays import sum_arrays # NOQA 2 | from chainerrl.functions.sum_arrays import SumArrays # NOQA 3 | from chainerrl.functions.weighted_sum_arrays import weighted_sum_arrays # NOQA 4 | from chainerrl.functions.weighted_sum_arrays import WeightedSumArrays # NOQA 5 | -------------------------------------------------------------------------------- /chainerrl/misc/ask_yes_no.py: -------------------------------------------------------------------------------- 1 | from builtins import * # NOQA 2 | 3 | 4 | def ask_yes_no(question): 5 | while True: 6 | choice = input("{} [y/N]: ".format(question)).lower() 7 | if choice in ['y', 'ye', 'yes']: 8 | return True 9 | elif choice in ['n', 'no']: 10 | return False 11 | -------------------------------------------------------------------------------- /docs/recurrent.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | Using recurrent models 3 | ====================== 4 | 5 | Recurrent model interface 6 | ========================= 7 | 8 | .. autoclass:: chainerrl.recurrent.Recurrent 9 | :members: 10 | 11 | Utilities 12 | ========= 13 | 14 | .. autofunction:: chainerrl.recurrent.state_kept 15 | 16 | .. 
autofunction:: chainerrl.recurrent.state_reset 17 | -------------------------------------------------------------------------------- /chainerrl/explorers/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.explorers.additive_gaussian import AdditiveGaussian # NOQA 2 | from chainerrl.explorers.additive_ou import AdditiveOU # NOQA 3 | from chainerrl.explorers.boltzmann import Boltzmann # NOQA 4 | from chainerrl.explorers.epsilon_greedy import ConstantEpsilonGreedy # NOQA 5 | from chainerrl.explorers.epsilon_greedy import LinearDecayEpsilonGreedy # NOQA 6 | from chainerrl.explorers.greedy import Greedy # NOQA 7 | -------------------------------------------------------------------------------- /docs/experiments.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Experiments 3 | =========== 4 | 5 | Training and evaluation 6 | ======================= 7 | 8 | .. autofunction:: chainerrl.experiments.train_agent_async 9 | 10 | .. autofunction:: chainerrl.experiments.train_agent_with_evaluation 11 | 12 | Training hooks 13 | ============== 14 | 15 | .. autoclass:: chainerrl.experiments.StepHook 16 | :members: 17 | 18 | .. autoclass:: chainerrl.experiments.LinearInterpolationHook 19 | -------------------------------------------------------------------------------- /chainerrl/initializers/normal.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import numpy as np 3 | 4 | 5 | class LeCunNormal(chainer.initializers.HeNormal): 6 | """LeCunNormal is (essentially) the default initializer in Chainer v1. 7 | 8 | chainer.initializers.LeCunNormal is not available yet. 9 | (Chainer Pull Request #2764 has not been merged.) 10 | """ 11 | 12 | def __init__(self, scale=1.0, dtype=None): 13 | super(LeCunNormal, self).__init__(np.sqrt(0.5) * scale, dtype) 14 | -------------------------------------------------------------------------------- /docs/action_values.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Action values 3 | ============= 4 | 5 | Action value interfaces 6 | ======================= 7 | 8 | .. autoclass:: chainerrl.action_value.ActionValue 9 | :members: 10 | 11 | Action value implementations 12 | ============================ 13 | 14 | .. autoclass:: chainerrl.action_value.DiscreteActionValue 15 | 16 | .. autoclass:: chainerrl.action_value.QuadraticActionValue 17 | 18 | .. 
autoclass:: chainerrl.action_value.SingleActionValue 19 | -------------------------------------------------------------------------------- /chainerrl/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.wrappers.cast_observation import CastObservation # NOQA 2 | from chainerrl.wrappers.cast_observation import CastObservationToFloat32 # NOQA 3 | 4 | from chainerrl.wrappers.continuing_time_limit import ContinuingTimeLimit # NOQA 5 | 6 | from chainerrl.wrappers.randomize_action import RandomizeAction # NOQA 7 | 8 | from chainerrl.wrappers.render import Render # NOQA 9 | 10 | from chainerrl.wrappers.scale_reward import ScaleReward # NOQA 11 | -------------------------------------------------------------------------------- /chainerrl/links/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.links.dqn_head import NatureDQNHead # NOQA 2 | from chainerrl.links.dqn_head import NIPSDQNHead # NOQA 3 | from chainerrl.links.empirical_normalization import EmpiricalNormalization # NOQA 4 | from chainerrl.links.mlp import MLP # NOQA 5 | from chainerrl.links.mlp_bn import MLPBN # NOQA 6 | from chainerrl.links.noisy_chain import to_factorized_noisy # NOQA 7 | from chainerrl.links.noisy_linear import FactorizedNoisyLinear # NOQA 8 | from chainerrl.links.sequence import Sequence # NOQA 9 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | How to install ChainerRL 6 | ======================== 7 | 8 | ChainerRL is tested with Python 2.7+ and 3.5.1+. For other requirements, see ``requirements.txt``. 9 | 10 | .. literalinclude:: ../requirements.txt 11 | :caption: requirements.txt 12 | 13 | ChainerRL can be installed via PyPI, 14 | 15 | :: 16 | 17 | pip install chainerrl 18 | 19 | or through the source code: 20 | 21 | :: 22 | 23 | git clone https://github.com/chainer/chainerrl.git 24 | cd chainerrl 25 | python setup.py install 26 | -------------------------------------------------------------------------------- /docs/distributions.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Distributions 3 | ============= 4 | 5 | Distribution interfaces 6 | ======================= 7 | 8 | .. autoclass:: chainerrl.distribution.Distribution 9 | :members: 10 | 11 | 12 | Distribution implementations 13 | ============================ 14 | 15 | .. autoclass:: chainerrl.distribution.GaussianDistribution 16 | 17 | .. autoclass:: chainerrl.distribution.SoftmaxDistribution 18 | 19 | .. autoclass:: chainerrl.distribution.MellowmaxDistribution 20 | 21 | .. 
autoclass:: chainerrl.distribution.ContinuousDeterministicDistribution 22 | -------------------------------------------------------------------------------- /chainerrl/v_function.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | from future.utils import with_metaclass 13 | 14 | 15 | class VFunction(with_metaclass(ABCMeta, object)): 16 | 17 | @abstractmethod 18 | def __call__(self, x): 19 | raise NotImplementedError() 20 | -------------------------------------------------------------------------------- /chainerrl/misc/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.misc.batch_states import batch_states # NOQA 2 | from chainerrl.misc.conjugate_gradient import conjugate_gradient # NOQA 3 | from chainerrl.misc.draw_computational_graph import collect_variables # NOQA 4 | from chainerrl.misc.draw_computational_graph import draw_computational_graph # NOQA 5 | from chainerrl.misc.draw_computational_graph import is_graphviz_available # NOQA 6 | from chainerrl.misc import env_modifiers # NOQA 7 | from chainerrl.misc.is_return_code_zero import is_return_code_zero # NOQA 8 | from chainerrl.misc.random_seed import set_random_seed # NOQA 9 | -------------------------------------------------------------------------------- /chainerrl/explorers/greedy.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainerrl import explorer 10 | 11 | 12 | class Greedy(explorer.Explorer): 13 | """No exploration""" 14 | 15 | def select_action(self, t, greedy_action_func, action_value=None): 16 | return greedy_action_func() 17 | 18 | def __repr__(self): 19 | return 'Greedy()' 20 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = ChainerRL 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
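# For example, "make html" falls through to the catch-all rule below and
# expands to: sphinx-build -M html "." "_build"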
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /chainerrl/misc/makedirs.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import os 10 | import six 11 | 12 | 13 | def makedirs(name, mode=0o777, exist_ok=False): 14 | """A wrapper of os.makedirs that accepts exist_ok.""" 15 | if six.PY2: 16 | try: 17 | os.makedirs(name, mode) 18 | except OSError: 19 | if not os.path.isdir(name): 20 | raise 21 | else: 22 | os.makedirs(name, mode, exist_ok=exist_ok) 23 | -------------------------------------------------------------------------------- /chainerrl/initializers/constant.py: -------------------------------------------------------------------------------- 1 | from chainer import initializer 2 | from chainer.initializers import Constant 3 | import numpy 4 | 5 | 6 | class VarianceScalingConstant(initializer.Initializer): 7 | def __init__(self, scale=1.0, dtype=None): 8 | super(VarianceScalingConstant, self).__init__(dtype) 9 | self.scale = scale 10 | 11 | def __call__(self, array): 12 | if self.dtype is not None: 13 | assert array.dtype == self.dtype 14 | 15 | if len(array.shape) == 1: 16 | Constant(self.scale / numpy.sqrt(array.shape[0]))(array) 17 | else: 18 | fan_in, _ = initializer.get_fans(array.shape) 19 | 20 | Constant(self.scale / numpy.sqrt(fan_in))(array) 21 | -------------------------------------------------------------------------------- /chainerrl/misc/batch_states.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | 3 | 4 | def batch_states(states, xp, phi): 5 | """The default method for making a batch of observations. 6 | 7 | Args: 8 | states (list): list of observations from an environment. 9 | xp (module): numpy or cupy 10 | phi (callable): Feature extractor applied to observations 11 | 12 | Returns: 13 | the object which will be given as input to the model.
14 | """ 15 | if chainer.cuda.available and xp is chainer.cuda.cupy: 16 | # GPU 17 | device = chainer.cuda.Device().id 18 | else: 19 | # CPU 20 | device = -1 21 | 22 | features = [phi(s) for s in states] 23 | return chainer.dataset.concat_examples(features, device=device) 24 | -------------------------------------------------------------------------------- /tests/misc_tests/test_is_return_code_zero.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import unittest 9 | 10 | import chainerrl 11 | 12 | 13 | class TestIsReturnCodeZero(unittest.TestCase): 14 | 15 | def test(self): 16 | # Assume ls command exists 17 | self.assertTrue(chainerrl.misc.is_return_code_zero(['ls'])) 18 | self.assertFalse(chainerrl.misc.is_return_code_zero( 19 | ['ls --nonexistentoption'])) 20 | self.assertFalse(chainerrl.misc.is_return_code_zero( 21 | ['nonexistentcommand'])) 22 | -------------------------------------------------------------------------------- /examples/ale/README.md: -------------------------------------------------------------------------------- 1 | # Examples for Arcade Learning Environment 2 | 3 | - `train_a3c_ale.py`: A3C 4 | - `train_acer_ale.py`: ACER 5 | - `train_categorical_dqn_ale.py`: CategoricalDQN 6 | - `train_dqn_ale.py`: DQN, DoubleDQN or PAL 7 | - `train_nsq_ale.py`: NSQ (n-step Q-learning) 8 | - `train_ppo_ale.py`: PPO 9 | 10 | ## Requirements 11 | 12 | - atari_py>=0.1.1 13 | - opencv-python 14 | 15 | ## How to run 16 | 17 | ``` 18 | python train_a3c_ale.py n_processes [options] 19 | python train_acer_ale.py n_processes [options] 20 | python train_categorical_dqn_ale.py [options] 21 | python train_dqn_ale.py [options] 22 | python train_nsq_ale.py n_processes [options] 23 | python train_ppo_ale.py [options] 24 | ``` 25 | 26 | Specify `--help` or read code for options. 
27 | -------------------------------------------------------------------------------- /chainerrl/q_function.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | from future.utils import with_metaclass 13 | 14 | 15 | class StateQFunction(with_metaclass(ABCMeta, object)): 16 | 17 | @abstractmethod 18 | def __call__(self, x): 19 | raise NotImplementedError() 20 | 21 | 22 | class StateActionQFunction(with_metaclass(ABCMeta, object)): 23 | 24 | @abstractmethod 25 | def __call__(self, x, a): 26 | raise NotImplementedError() 27 | -------------------------------------------------------------------------------- /chainerrl/policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | from future.utils import with_metaclass 13 | 14 | from logging import getLogger 15 | logger = getLogger(__name__) 16 | 17 | 18 | class Policy(with_metaclass(ABCMeta, object)): 19 | """Abstract policy.""" 20 | 21 | @abstractmethod 22 | def __call__(self, state): 23 | """Evaluate a policy. 24 | 25 | Returns: 26 | Distribution of actions 27 | """ 28 | raise NotImplementedError() 29 | -------------------------------------------------------------------------------- /chainerrl/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.experiments.evaluator import eval_performance # NOQA 2 | 3 | from chainerrl.experiments.hooks import LinearInterpolationHook # NOQA 4 | from chainerrl.experiments.hooks import StepHook # NOQA 5 | 6 | from chainerrl.experiments.prepare_output_dir import is_under_git_control # NOQA 7 | from chainerrl.experiments.prepare_output_dir import prepare_output_dir # NOQA 8 | 9 | from chainerrl.experiments.train_agent import train_agent # NOQA 10 | from chainerrl.experiments.train_agent import train_agent_with_evaluation # NOQA 11 | from chainerrl.experiments.train_agent_async import train_agent_async # NOQA 12 | from chainerrl.experiments.train_agent_batch import train_agent_batch # NOQA 13 | from chainerrl.experiments.train_agent_batch import train_agent_batch_with_evaluation # NOQA 14 | -------------------------------------------------------------------------------- /examples/ale/dqn_phi.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import numpy as np 10 | 11 | 12 | def dqn_phi(screens): 13 | """Phi (feature extractor) of DQN for ALE 14 | 15 | Args: 16 | screens: List of N screen objects. 
Each screen object must be 17 | a numpy.ndarray whose dtype is numpy.uint8. 18 | Returns: 19 | numpy.ndarray 20 | """ 21 | assert len(screens) == 4 22 | assert screens[0].dtype == np.uint8 23 | raw_values = np.asarray(screens, dtype=np.float32) 24 | # [0,255] -> [0, 1] 25 | raw_values /= 255.0 26 | return raw_values 27 | -------------------------------------------------------------------------------- /tests/explorers_tests/test_additive_gaussian.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import numpy as np 10 | 11 | from chainerrl.explorers.additive_gaussian import AdditiveGaussian 12 | 13 | 14 | class TestAdditiveGaussian(unittest.TestCase): 15 | 16 | def test(self): 17 | 18 | action_size = 3 19 | scale = 0.1 20 | 21 | def greedy_action_func(): 22 | return np.asarray([0] * action_size, dtype=np.float32) 23 | 24 | explorer = AdditiveGaussian(scale) 25 | 26 | for t in range(1000): 27 | a = explorer.select_action(t, greedy_action_func) 28 | print(a) 29 | -------------------------------------------------------------------------------- /chainerrl/explorer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | from future.utils import with_metaclass 12 | 13 | 14 | class Explorer(with_metaclass(ABCMeta, object)): 15 | """Abstract explorer.""" 16 | 17 | @abstractmethod 18 | def select_action(self, t, greedy_action_func, action_value=None): 19 | """Select an action.
20 | 21 | Args: 22 | t: current time step 23 | greedy_action_func: function with no argument that returns an action 24 | action_value (ActionValue): ActionValue object 25 | """ 26 | raise NotImplementedError() 27 | -------------------------------------------------------------------------------- /chainerrl/functions/scale_grad.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import numpy 8 | 9 | import chainer 10 | from chainer.utils import type_check 11 | 12 | 13 | class ScaleGrad(chainer.Function): 14 | 15 | def __init__(self, scale): 16 | self.scale = scale 17 | 18 | def check_type_forward(self, in_types): 19 | type_check.expect( 20 | in_types.size() == 1, 21 | in_types[0].dtype == numpy.float32 22 | ) 23 | 24 | def forward(self, x): 25 | return x 26 | 27 | def backward(self, x, gy): 28 | return tuple(g * self.scale for g in gy) 29 | 30 | 31 | def scale_grad(x, scale): 32 | return ScaleGrad(scale=scale)(x) 33 | -------------------------------------------------------------------------------- /chainerrl/misc/is_return_code_zero.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import os 9 | import subprocess 10 | 11 | 12 | def is_return_code_zero(args): 13 | """Return true iff the given command's return code is zero. 14 | 15 | All the messages to stdout or stderr are suppressed. 16 | """ 17 | with open(os.devnull, 'wb') as FNULL: 18 | try: 19 | subprocess.check_call(args, stdout=FNULL, stderr=FNULL) 20 | except subprocess.CalledProcessError: 21 | # The given command returned an error 22 | return False 23 | except OSError: 24 | # The given command was not found 25 | return False 26 | return True 27 | -------------------------------------------------------------------------------- /chainerrl/wrappers/scale_reward.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import gym 10 | 11 | 12 | class ScaleReward(gym.RewardWrapper): 13 | """Scale reward by a scale factor. 14 | 15 | Args: 16 | env: Env to wrap. 17 | scale (float): Scale factor. 18 | 19 | Attributes: 20 | scale: Scale factor. 21 | original_reward: Reward before scaling.
22 | """ 23 | 24 | def __init__(self, env, scale): 25 | super().__init__(env) 26 | self.scale = scale 27 | self.original_reward = None 28 | 29 | def _reward(self, reward): 30 | self.original_reward = reward 31 | return self.scale * reward 32 | -------------------------------------------------------------------------------- /examples/gym/README.md: -------------------------------------------------------------------------------- 1 | # Examples for OpenAI Gym environments 2 | 3 | - `train_a3c_gym.py`: A3C for both discrete action and continuous action spaces 4 | - `train_acer_gym.py`: DiscreteACER for discrete action spaces 5 | - `train_dqn_gym.py`: DQN for both discrete action and continuous action spaces 6 | - `train_ddpg_gym.py`: DDPG for continuous action spaces 7 | - `train_pcl_gym.py`: PCL for both discrete action and continuous action spaces 8 | - `train_reinforce_gym.py`: REINFORCE for both discrete action and continuous action spaces (only for episodic envs) 9 | 10 | ## How to run 11 | 12 | ``` 13 | python train_a3c_gym.py n_processes [options] 14 | python train_acer_gym.py n_processes [options] 15 | python train_dqn_gym.py [options] 16 | python train_ddpg_gym.py [options] 17 | python train_pcl_gym.py [options] 18 | python train_reinforce_gym.py [options] 19 | ``` 20 | 21 | Specify `--help` or read code for options. 22 | -------------------------------------------------------------------------------- /docs/agents.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | Agents 3 | ====== 4 | 5 | Agent interfaces 6 | ================ 7 | 8 | .. autoclass:: chainerrl.agent.Agent 9 | :members: 10 | 11 | Agent implementations 12 | ===================== 13 | 14 | .. autoclass:: chainerrl.agents.A3C 15 | 16 | .. autoclass:: chainerrl.agents.ACER 17 | 18 | .. autoclass:: chainerrl.agents.AL 19 | 20 | .. autoclass:: chainerrl.agents.DDPG 21 | 22 | .. autoclass:: chainerrl.agents.DoubleDQN 23 | 24 | .. autoclass:: chainerrl.agents.DoublePAL 25 | 26 | .. autoclass:: chainerrl.agents.DPP 27 | 28 | .. autoclass:: chainerrl.agents.DQN 29 | 30 | .. autoclass:: chainerrl.agents.NSQ 31 | 32 | .. autoclass:: chainerrl.agents.PAL 33 | 34 | .. autoclass:: chainerrl.agents.PCL 35 | 36 | .. autoclass:: chainerrl.agents.PGT 37 | 38 | .. autoclass:: chainerrl.agents.REINFORCE 39 | 40 | .. autoclass:: chainerrl.agents.ResidualDQN 41 | 42 | .. autoclass:: chainerrl.agents.SARSA 43 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=ChainerRL 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. ChainerRL documentation master file, created by 2 | sphinx-quickstart on Tue Mar 14 22:25:44 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ================================================ 7 | ChainerRL, a deep reinforcement learning library 8 | ================================================ 9 | 10 | ChainerRL is a deep reinforcement learning library that implements various state-of-the-art deep reinforcement learning algorithms in Python using `Chainer <https://chainer.org/>`_, a flexible deep learning framework. 11 | 12 | .. toctree:: 13 | :maxdepth: 2 14 | 15 | install 16 | Quickstart Guide 17 | reference 18 | 19 | 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | -------------------------------------------------------------------------------- /chainerrl/wrappers/render.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import gym 10 | 11 | 12 | class Render(gym.Wrapper): 13 | """Render env by calling its render method. 14 | 15 | Args: 16 | env (gym.Env): Env to wrap. 17 | **kwargs: Keyword arguments passed to the render method. 18 | """ 19 | 20 | def __init__(self, env, **kwargs): 21 | super().__init__(env) 22 | self._kwargs = kwargs 23 | 24 | def reset(self, **kwargs): 25 | ret = self.env.reset(**kwargs) 26 | self.env.render(**self._kwargs) 27 | return ret 28 | 29 | def step(self, action): 30 | ret = self.env.step(action) 31 | self.env.render(**self._kwargs) 32 | return ret 33 | -------------------------------------------------------------------------------- /examples/grasping/README.md: -------------------------------------------------------------------------------- 1 | # Bullet-based robotic grasping 2 | 3 | This directory contains example scripts that learn to grasp objects in an environment simulated by Bullet, a physics simulator. 4 | 5 | ![Grasping](../../assets/grasping.gif) 6 | 7 | ## Files 8 | 9 | - `train_dqn_batch_grasping.py`: DoubleDQN + prioritized experience replay 10 | 11 | ## Requirements 12 | 13 | - pybullet>=2.1.2 14 | 15 | ## How to run 16 | 17 | Train with one simulator, which is slow. 18 | ``` 19 | python examples/grasping/train_dqn_batch_grasping.py 20 | ``` 21 | 22 | Train with 96 simulators run in parallel, which is faster. 23 | ``` 24 | python examples/grasping/train_dqn_batch_grasping.py --num-envs 96 25 | ``` 26 | 27 | Watch how the learned agent performs. `<path to agent>` must be a path to a directory where the agent was saved (e.g. `2000000_finish` created inside the output directory specified as `--outdir`).
28 | ``` 29 | python examples/grasping/train_dqn_batch_grasping.py --demo --render --load <path to agent> 30 | ``` 31 | -------------------------------------------------------------------------------- /tests/experiments_tests/test_hooks.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | import unittest 9 | 10 | import numpy as np 11 | 12 | import chainerrl 13 | 14 | 15 | class TestLinearInterpolationHook(unittest.TestCase): 16 | 17 | def test_call(self): 18 | 19 | buf = [] 20 | 21 | def setter(env, agent, value): 22 | buf.append(value) 23 | 24 | hook = chainerrl.experiments.LinearInterpolationHook( 25 | total_steps=10, 26 | start_value=0.1, 27 | stop_value=1.0, 28 | setter=setter) 29 | 30 | for step in range(1, 10 + 1): 31 | hook(env=None, agent=None, step=step) 32 | 33 | np.testing.assert_allclose( 34 | buf, np.arange(1, 10 + 1, dtype=np.float32) / 10) 35 | -------------------------------------------------------------------------------- /chainerrl/explorers/additive_gaussian.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import numpy as np 10 | 11 | from chainerrl import explorer 12 | 13 | 14 | class AdditiveGaussian(explorer.Explorer): 15 | """Additive Gaussian noise to actions. 16 | 17 | Each action must be a numpy.ndarray. 18 | 19 | Args: 20 | scale (float or array_like of floats): Scale parameter.
21 | """ 22 | 23 | def __init__(self, scale): 24 | self.scale = scale 25 | 26 | def select_action(self, t, greedy_action_func, action_value=None): 27 | a = greedy_action_func() 28 | noise = np.random.normal( 29 | scale=self.scale, size=a.shape).astype(np.float32) 30 | return a + noise 31 | 32 | def __repr__(self): 33 | return 'AdditiveGaussian(scale={})'.format(self.scale) 34 | -------------------------------------------------------------------------------- /chainerrl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.agents.a2c import A2C # NOQA 2 | from chainerrl.agents.a3c import A3C # NOQA 3 | from chainerrl.agents.acer import ACER # NOQA 4 | from chainerrl.agents.al import AL # NOQA 5 | from chainerrl.agents.categorical_dqn import CategoricalDQN # NOQA 6 | from chainerrl.agents.ddpg import DDPG # NOQA 7 | from chainerrl.agents.double_dqn import DoubleDQN # NOQA 8 | from chainerrl.agents.double_pal import DoublePAL # NOQA 9 | from chainerrl.agents.dpp import DPP # NOQA 10 | from chainerrl.agents.dqn import DQN # NOQA 11 | from chainerrl.agents.iqn import IQN # NOQA 12 | from chainerrl.agents.nsq import NSQ # NOQA 13 | from chainerrl.agents.pal import PAL # NOQA 14 | from chainerrl.agents.pcl import PCL # NOQA 15 | from chainerrl.agents.pgt import PGT # NOQA 16 | from chainerrl.agents.ppo import PPO # NOQA 17 | from chainerrl.agents.reinforce import REINFORCE # NOQA 18 | from chainerrl.agents.residual_dqn import ResidualDQN # NOQA 19 | from chainerrl.agents.sarsa import SARSA # NOQA 20 | from chainerrl.agents.trpo import TRPO # NOQA 21 | -------------------------------------------------------------------------------- /chainerrl/functions/bound_by_tanh.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import cuda 11 | from chainer import functions as F 12 | 13 | 14 | def bound_by_tanh(x, low, high): 15 | """Bound a given value into [low, high] by tanh. 
16 | 17 | Args: 18 | x (chainer.Variable): value to bound 19 | low (numpy.ndarray): lower bound 20 | high (numpy.ndarray): upper bound 21 | Returns: chainer.Variable 22 | """ 23 | assert isinstance(x, chainer.Variable) 24 | assert low is not None 25 | assert high is not None 26 | xp = cuda.get_array_module(x.array) 27 | x_scale = (high - low) / 2 28 | x_scale = xp.expand_dims(xp.asarray(x_scale), axis=0) 29 | x_mean = (high + low) / 2 30 | x_mean = xp.expand_dims(xp.asarray(x_mean), axis=0) 31 | return F.tanh(x) * x_scale + x_mean 32 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | from setuptools import find_packages 3 | from setuptools import setup 4 | import sys 5 | 6 | install_requires = [ 7 | 'cached-property', 8 | 'chainer>=2.0.0', 9 | 'future', 10 | 'gym>=0.9.7', 11 | 'numpy>=1.10.4', 12 | 'pillow', 13 | 'scipy', 14 | ] 15 | 16 | test_requires = [ 17 | 'pytest', 18 | ] 19 | 20 | if sys.version_info < (3, 2): 21 | install_requires.append('fastcache') 22 | 23 | if sys.version_info < (3, 4): 24 | install_requires.append('statistics') 25 | 26 | if sys.version_info < (3, 5): 27 | install_requires.append('funcsigs') 28 | 29 | setup(name='chainerrl', 30 | version='0.5.0', 31 | description='ChainerRL, a deep reinforcement learning library', 32 | long_description=codecs.open('README.md', 'r', encoding='utf-8').read(), 33 | long_description_content_type='text/markdown', 34 | author='Yasuhiro Fujita', 35 | author_email='fujita@preferred.jp', 36 | license='MIT License', 37 | packages=find_packages(), 38 | install_requires=install_requires, 39 | tests_require=test_requires)  # setuptools expects tests_require, not test_requires 40 | -------------------------------------------------------------------------------- /chainerrl/policies/mellowmax_policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from logging import getLogger 10 | 11 | import chainer 12 | 13 | from chainerrl import distribution 14 | from chainerrl.policy import Policy 15 | 16 | 17 | logger = getLogger(__name__) 18 | 19 | 20 | class MellowmaxPolicy(chainer.Chain, Policy): 21 | """Mellowmax policy. 22 | 23 | See: http://arxiv.org/abs/1612.05628 24 | 25 | Args: 26 | model (chainer.Link): 27 | Link that is callable and outputs action values. 28 | omega (float): 29 | Parameter of the mellowmax function. 30 | """ 31 | 32 | def __init__(self, model, omega=1.): 33 | self.omega = omega 34 | super().__init__(model=model) 35 | 36 | def __call__(self, x): 37 | h = self.model(x) 38 | return distribution.MellowmaxDistribution(h, omega=self.omega) 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Preferred Networks, Inc.
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /chainerrl/agents/double_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import chainer 9 | 10 | from chainerrl.agents import dqn 11 | from chainerrl.recurrent import state_kept 12 | 13 | 14 | class DoubleDQN(dqn.DQN): 15 | """Double DQN. 16 | 17 | See: http://arxiv.org/abs/1509.06461. 18 | """ 19 | 20 | def _compute_target_values(self, exp_batch): 21 | 22 | batch_next_state = exp_batch['next_state'] 23 | 24 | with chainer.using_config('train', False), state_kept(self.q_function): 25 | next_qout = self.q_function(batch_next_state) 26 | 27 | target_next_qout = self.target_q_function(batch_next_state) 28 | 29 | next_q_max = target_next_qout.evaluate_actions( 30 | next_qout.greedy_actions) 31 | 32 | batch_rewards = exp_batch['reward'] 33 | batch_terminal = exp_batch['is_state_terminal'] 34 | discount = exp_batch['discount'] 35 | 36 | return batch_rewards + discount * (1.0 - batch_terminal) * next_q_max 37 | -------------------------------------------------------------------------------- /chainerrl/misc/init_like_torch.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | from chainer import links as L 8 | import numpy as np 9 | 10 | 11 | def init_like_torch(link): 12 | # Mimic torch's default parameter initialization 13 | # TODO(muupan): Use chainer's initializers when it is merged 14 | for l in link.links(): 15 | if isinstance(l, L.Linear): 16 | out_channels, in_channels = l.W.shape 17 | stdv = 1 / np.sqrt(in_channels) 18 | l.W.array[:] = np.random.uniform(-stdv, stdv, size=l.W.shape) 19 | if l.b is not None: 20 | l.b.array[:] = np.random.uniform(-stdv, stdv, size=l.b.shape) 21 | elif isinstance(l, L.Convolution2D): 22 | out_channels, in_channels, kh, kw = l.W.shape 23 | stdv = 1 / np.sqrt(in_channels * kh * kw) 24 | l.W.array[:] = 
np.random.uniform(-stdv, stdv, size=l.W.shape) 25 | if l.b is not None: 26 | l.b.array[:] = np.random.uniform(-stdv, stdv, size=l.b.shape) 27 | -------------------------------------------------------------------------------- /tests/explorers_tests/test_additive_ou.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | from chainer import testing 10 | import numpy as np 11 | 12 | from chainerrl.explorers.additive_ou import AdditiveOU 13 | 14 | 15 | @testing.parameterize(*testing.product({ 16 | 'action_size': [1, 3], 17 | 'sigma_type': ['scalar', 'ndarray'], 18 | })) 19 | class TestAdditiveOU(unittest.TestCase): 20 | 21 | def test(self): 22 | 23 | def greedy_action_func(): 24 | return np.asarray([0] * self.action_size, dtype=np.float32) 25 | 26 | if self.sigma_type == 'scalar': 27 | sigma = np.random.rand() 28 | elif self.sigma_type == 'ndarray': 29 | sigma = np.random.rand(self.action_size) 30 | theta = np.random.rand() 31 | 32 | explorer = AdditiveOU(theta=theta, sigma=sigma) 33 | 34 | print('theta:', theta, 'sigma', sigma) 35 | for t in range(100): 36 | a = explorer.select_action(t, greedy_action_func) 37 | print(t, a) 38 | -------------------------------------------------------------------------------- /chainerrl/misc/random.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import numpy as np 10 | 11 | 12 | def sample_n_k(n, k): 13 | """Sample k distinct elements uniformly from range(n)""" 14 | 15 | if not 0 <= k <= n: 16 | raise ValueError("Sample larger than population or is negative") 17 | if k == 0: 18 | return np.empty((0,), dtype=np.int64) 19 | elif 3 * k >= n: 20 | return np.random.choice(n, k, replace=False) 21 | else: 22 | result = np.random.choice(n, 2 * k) 23 | selected = set() 24 | selected_add = selected.add 25 | j = k 26 | for i in range(k): 27 | x = result[i] 28 | while x in selected: 29 | x = result[i] = result[j] 30 | j += 1 31 | if j == 2 * k: 32 | # This is slow, but it rarely happens. 33 | result[k:] = np.random.choice(n, k) 34 | j = k 35 | selected_add(x) 36 | return result[:k] 37 | -------------------------------------------------------------------------------- /chainerrl/explorers/boltzmann.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | import numpy as np 12 | 13 | import chainerrl 14 | 15 | 16 | class Boltzmann(chainerrl.explorer.Explorer): 17 | """Boltzmann exploration. 18 | 19 | Args: 20 | T (float): Temperature of Boltzmann distribution. 
21 | """ 22 | 23 | def __init__(self, T=1.0): 24 | self.T = T 25 | 26 | def select_action(self, t, greedy_action_func, action_value=None): 27 | assert action_value is not None 28 | assert isinstance(action_value, 29 | chainerrl.action_value.DiscreteActionValue) 30 | n_actions = action_value.q_values.shape[1] 31 | with chainer.no_backprop_mode(): 32 | probs = chainer.cuda.to_cpu( 33 | F.softmax(action_value.q_values / self.T).array).ravel() 34 | return np.random.choice(np.arange(n_actions), p=probs) 35 | 36 | def __repr__(self): 37 | return 'Boltzmann(T={})'.format(self.T) 38 | -------------------------------------------------------------------------------- /chainerrl/wrappers/cast_observation.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import gym 10 | import numpy as np 11 | 12 | 13 | class CastObservation(gym.ObservationWrapper): 14 | """Cast observations to a given type. 15 | 16 | Args: 17 | env: Env to wrap. 18 | dtype: Data type object. 19 | 20 | Attributes: 21 | original_observation: Observation before casting. 22 | """ 23 | 24 | def __init__(self, env, dtype): 25 | super().__init__(env) 26 | self.dtype = dtype 27 | 28 | def _observation(self, observation): 29 | self.original_observation = observation 30 | return observation.astype(self.dtype, copy=False) 31 | 32 | 33 | class CastObservationToFloat32(CastObservation): 34 | """Cast observations to float32, which is common in Chainer. 35 | 36 | Args: 37 | env: Env to wrap. 38 | 39 | Attributes: 40 | original_observation: Observation before casting. 41 | """ 42 | 43 | def __init__(self, env): 44 | super().__init__(env, np.float32) 45 | -------------------------------------------------------------------------------- /tests/q_functions_tests/basetest_state_action_q_function.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | import chainer 12 | import numpy as np 13 | 14 | 15 | class _TestSAQFunction(unittest.TestCase): 16 | 17 | def _test_call_given_model(self, model, gpu): 18 | # This method only check if a given model can receive random input 19 | # data and return output data with the correct interface. 
20 | batch_size = 7 21 | obs = np.random.rand(batch_size, self.n_dim_obs).astype(np.float32) 22 | action = np.random.rand( 23 | batch_size, self.n_dim_action).astype(np.float32) 24 | if gpu >= 0: 25 | model.to_gpu(gpu) 26 | obs = chainer.cuda.to_gpu(obs) 27 | action = chainer.cuda.to_gpu(action) 28 | y = model(obs, action) 29 | self.assertTrue(isinstance(y, chainer.Variable)) 30 | self.assertEqual(y.shape, (batch_size, 1)) 31 | self.assertEqual(chainer.cuda.get_array_module(y), 32 | chainer.cuda.get_array_module(obs)) 33 | -------------------------------------------------------------------------------- /tests/agents_tests/test_ddpg.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_ddpg as base 10 | from chainerrl.agents.ddpg import DDPG 11 | 12 | 13 | class TestDDPGOnContinuousPOABC(base._TestDDPGOnContinuousPOABC): 14 | 15 | def make_ddpg_agent(self, env, model, actor_opt, critic_opt, explorer, 16 | rbuf, gpu): 17 | return DDPG(model, actor_opt, critic_opt, rbuf, gpu=gpu, gamma=0.9, 18 | explorer=explorer, replay_start_size=100, 19 | target_update_method='soft', target_update_interval=1, 20 | episodic_update=True, update_interval=1) 21 | 22 | 23 | class TestDDPGOnContinuousABC(base._TestDDPGOnContinuousABC): 24 | 25 | def make_ddpg_agent(self, env, model, actor_opt, critic_opt, explorer, 26 | rbuf, gpu): 27 | return DDPG(model, actor_opt, critic_opt, rbuf, gpu=gpu, gamma=0.9, 28 | explorer=explorer, replay_start_size=100, 29 | target_update_method='soft', target_update_interval=1, 30 | episodic_update=False) 31 | -------------------------------------------------------------------------------- /chainerrl/misc/reward_filter.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | 9 | class NormalizedRewardFilter(object): 10 | 11 | def __init__(self, tau=1e-3, scale=1, eps=1e-1): 12 | self.tau = tau 13 | self.scale = scale 14 | self.average_reward = 0 15 | self.average_reward_squared = 0 16 | self.eps = eps 17 | 18 | def __call__(self, reward): 19 | self.average_reward *= 1 - self.tau 20 | self.average_reward += self.tau * reward 21 | self.average_reward_squared *= 1 - self.tau 22 | self.average_reward_squared += self.tau * reward ** 2 23 | var = self.average_reward_squared - self.average_reward ** 2 24 | stdev = min(var, self.eps) ** 0.5 25 | return self.scale * (reward - self.average_reward) / stdev 26 | 27 | 28 | class AverageRewardFilter(object): 29 | 30 | def __init__(self, tau=1e-3): 31 | self.tau = tau 32 | self.average_reward = 0 33 | 34 | def __call__(self, reward): 35 | self.average_reward *= 1 - self.tau 36 | self.average_reward += self.tau * reward 37 | return reward - self.average_reward 38 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to ChainerRL 2 | 3 | Any kind of contribution to ChainerRL would be 
highly appreciated! 4 | 5 | Contribution examples: 6 | - Giving a thumbs-up to good issues or pull requests :+1: 7 | - Opening issues about questions, bugs, installation problems, feature requests, algorithm requests, etc. 8 | - Sending pull requests 9 | 10 | Before sending a pull request to ChainerRL, please make sure all the tests pass. 11 | 12 | ## Testing 13 | 14 | To test chainerrl modules, install and run `pytest`. Pass `-m "not gpu"` to skip tests that require a GPU. E.g. 15 | ``` 16 | $ pip install pytest 17 | $ pytest -m "not gpu" 18 | ``` 19 | 20 | 21 | To test examples, run `test_examples.sh [gpu device id]`. Passing `-1` runs the examples on CPU only. 22 | 23 | ## Coding style 24 | 25 | We use PEP8. To check your code, use the `autopep8` and `flake8` packages. 26 | ``` 27 | $ pip install autopep8 flake8 28 | $ autopep8 --diff path/to/your/code.py 29 | $ flake8 path/to/your/code.py 30 | ``` 31 | 32 | 33 | To use Python 3 features as much as possible while keeping Python 2 support, add the following lines to the head of each file. 34 | ``` 35 | from __future__ import print_function 36 | from __future__ import unicode_literals 37 | from __future__ import division 38 | from __future__ import absolute_import 39 | from builtins import * # NOQA 40 | from future import standard_library 41 | standard_library.install_aliases() 42 | ``` 43 | -------------------------------------------------------------------------------- /tests/links_tests/test_noisy_chain.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import chainer 4 | 5 | from chainerrl.links import to_factorized_noisy 6 | 7 | 8 | def names_of_links(link): 9 | return set([name for name, _ in link.namedlinks(skipself=True)]) 10 | 11 | 12 | class TestToFactorizedNoisy(unittest.TestCase): 13 | def test_chainlist(self): 14 | ch = chainer.ChainList( 15 | chainer.links.Linear(3, 4), 16 | chainer.links.Linear(5), 17 | chainer.links.PReLU(), 18 | ) 19 | self.assertEqual( 20 | names_of_links(ch), 21 | {'/0', '/1', '/2'}) 22 | 23 | to_factorized_noisy(ch) 24 | self.assertEqual( 25 | names_of_links(ch), 26 | { 27 | '/0', '/0/mu', '/0/sigma', 28 | '/1', '/1/mu', '/1/sigma', '/2'}) 29 | 30 | def test_chain(self): 31 | ch = chainer.Chain() 32 | with ch.init_scope(): 33 | ch.l1 = chainer.links.Linear(3, 4) 34 | ch.l2 = chainer.links.Linear(5) 35 | ch.l3 = chainer.links.PReLU() 36 | self.assertEqual( 37 | names_of_links(ch), 38 | {'/l1', '/l2', '/l3'}) 39 | 40 | to_factorized_noisy(ch) 41 | self.assertEqual( 42 | names_of_links(ch), 43 | { 44 | '/l1', '/l1/mu', '/l1/sigma', 45 | '/l2', '/l2/mu', '/l2/sigma', '/l3'}) 46 | -------------------------------------------------------------------------------- /tests/agents_tests/test_sarsa.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from builtins import * # NOQA 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_dqn_like as base 10 | from chainerrl.agents import SARSA 11 | 12 | # Sarsa does not support batch training 13 | 14 | 15 | class TestSARSAOnDiscreteABC(base._TestDQNOnDiscreteABC): 16 | 17 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 18 | return SARSA( 19 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 20 | replay_start_size=100,
target_update_interval=100) 21 | 22 | 23 | class TestSARSAOnContinuousABC(base._TestDQNOnContinuousABC): 24 | 25 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 26 | return SARSA( 27 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 28 | replay_start_size=100, target_update_interval=100) 29 | 30 | 31 | class TestSARSAOnDiscretePOABC(base._TestDQNOnDiscretePOABC): 32 | 33 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 34 | return SARSA( 35 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 36 | replay_start_size=100, target_update_interval=100, 37 | episodic_update=True) 38 | -------------------------------------------------------------------------------- /tests/wrappers_tests/test_scale_reward.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | 10 | import unittest 11 | 12 | from chainer import testing 13 | import gym 14 | 15 | import chainerrl 16 | 17 | 18 | @testing.parameterize(*testing.product({ 19 | 'env_id': ['CartPole-v1', 'MountainCar-v0'], 20 | 'scale': [1.0, 0.1] 21 | })) 22 | class TestScaleReward(unittest.TestCase): 23 | 24 | def test_scale_reward(self): 25 | env = chainerrl.wrappers.ScaleReward( 26 | gym.make(self.env_id), scale=self.scale) 27 | self.assertIsNone(env.original_reward) 28 | self.assertAlmostEqual(env.scale, self.scale) 29 | 30 | _ = env.reset() 31 | _, r, _, _ = env.step(env.action_space.sample()) 32 | 33 | if self.env_id == 'CartPole-v1': 34 | # Original reward must be 1 35 | self.assertAlmostEqual(env.original_reward, 1) 36 | self.assertAlmostEqual(r, self.scale) 37 | elif self.env_id == 'MountainCar-v0': 38 | # Original reward must be -1 39 | self.assertAlmostEqual(env.original_reward, -1) 40 | self.assertAlmostEqual(r, -self.scale) 41 | else: 42 | assert False 43 | -------------------------------------------------------------------------------- /tests/agents_tests/test_pgt.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_pgt as base 10 | from chainerrl.agents.pgt import PGT 11 | 12 | 13 | # Currently PGT does not support recurrent models 14 | # class TestPGTOnContinuousPOABC(base._TestPGTOnContinuousPOABC): 15 | # 16 | # def make_pgt_agent(self, env, model, actor_opt, critic_opt, explorer, 17 | # rbuf, gpu): 18 | # return PGT(model, actor_opt, critic_opt, rbuf, gpu=gpu, gamma=0.9, 19 | # explorer=explorer, replay_start_size=100, 20 | # target_update_method='soft', target_update_interval=1, 21 | # episodic_update=True, update_interval=1, 22 | # act_deterministically=True) 23 | 24 | 25 | class TestPGTOnContinuousABC(base._TestPGTOnContinuousABC): 26 | 27 | def make_pgt_agent(self, env, model, actor_opt, critic_opt, explorer, 28 | rbuf, gpu): 29 | return PGT(model, actor_opt, critic_opt, rbuf, gpu=gpu, gamma=0.9, 30 | explorer=explorer, replay_start_size=100, 31 | target_update_method='soft', target_update_interval=1, 32 | 
act_deterministically=True) 33 | -------------------------------------------------------------------------------- /chainerrl/functions/sum_arrays.py: -------------------------------------------------------------------------------- 1 | from chainer import cuda 2 | from chainer import function 3 | from chainer import utils 4 | from chainer.utils import type_check 5 | 6 | 7 | class SumArrays(function.Function): 8 | """Element-wise sum of input arrays.""" 9 | 10 | def check_type_forward(self, in_types): 11 | type_check.expect( 12 | in_types[0].dtype.kind == 'f', 13 | ) 14 | 15 | def forward_cpu(self, inputs): 16 | y = sum(inputs) 17 | return utils.force_array(y), 18 | 19 | def backward(self, inputs, grads): 20 | return [grads[0]] * len(inputs) 21 | 22 | def forward_gpu(self, inputs): 23 | n = len(inputs) 24 | ptrs = cuda.cupy.asarray([x.data.ptr for x in inputs], 25 | dtype=cuda.cupy.int64) 26 | y = cuda.elementwise( 27 | 'T x0, int64 xs, int32 n_xs', 28 | 'T y', 29 | 'float** xs_ = (float**) xs;' 30 | 'y = 0;' 31 | 'for (size_t j = 0; j < n_xs; ++j) {' 32 | ' y += xs_[j][i];' 33 | '}', 34 | 'sum_arrays')(inputs[0], ptrs.data.ptr, n) 35 | return y, 36 | 37 | 38 | def sum_arrays(xs): 39 | """Element-wise sum of input arrays. 40 | 41 | Args: 42 | xs (tuple of ~chainer.Variable or ndarray): Input arrays to be summed. 43 | 44 | Returns: 45 | ~chainer.Variable: Output variable. 46 | """ 47 | return SumArrays()(*xs) 48 | -------------------------------------------------------------------------------- /chainerrl/agents/sarsa.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainerrl.agents import dqn 10 | 11 | 12 | class SARSA(dqn.DQN): 13 | """SARSA. 14 | 15 | Unlike DQN, this agent uses actions that were actually taken to 16 | compute target Q values, and is thus an on-policy algorithm.
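To make the docstring above concrete, here is a hedged NumPy sketch (not library code; all numbers are invented) contrasting the SARSA target with the DQN target on a toy batch of two transitions:
```
import numpy as np

gamma = 0.9
rewards = np.array([1.0, 0.0], dtype=np.float32)
terminal = np.array([0.0, 1.0], dtype=np.float32)
# Q-values of the next states for 3 discrete actions.
next_q = np.array([[0.1, 0.5, 0.2],
                   [0.3, 0.0, 0.4]], dtype=np.float32)
next_actions = np.array([0, 2])  # actions actually taken at the next step

# SARSA evaluates the action actually taken (on-policy) ...
sarsa_target = rewards + gamma * (1.0 - terminal) * next_q[np.arange(2), next_actions]
# ... while DQN evaluates the greedy action (off-policy).
dqn_target = rewards + gamma * (1.0 - terminal) * next_q.max(axis=1)
print(sarsa_target)  # [1.09 0.  ]
print(dqn_target)    # [1.45 0.  ]
```
In the second transition the terminal flag zeroes the bootstrap term, so both targets reduce to the reward.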
17 | """ 18 | 19 | def _compute_target_values(self, exp_batch): 20 | 21 | batch_next_state = exp_batch['next_state'] 22 | batch_next_action = exp_batch['next_action'] 23 | 24 | next_target_action_value = self.target_q_function( 25 | batch_next_state) 26 | next_q = next_target_action_value.evaluate_actions( 27 | batch_next_action) 28 | 29 | batch_rewards = exp_batch['reward'] 30 | batch_terminal = exp_batch['is_state_terminal'] 31 | discount = exp_batch['discount'] 32 | 33 | return batch_rewards + discount * (1.0 - batch_terminal) * next_q 34 | 35 | def batch_act_and_train(self, batch_obs): 36 | raise NotImplementedError('SARSA does not support batch training') 37 | 38 | def batch_observe_and_train(self, batch_obs, batch_reward, 39 | batch_done, batch_reset): 40 | raise NotImplementedError('SARSA does not support batch training') 41 | -------------------------------------------------------------------------------- /chainerrl/env.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from future.utils import with_metaclass 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | 13 | class Env(with_metaclass(ABCMeta, object)): 14 | """RL learning environment. 15 | 16 | This serves a minimal interface for RL agents. 17 | """ 18 | 19 | @abstractmethod 20 | def step(self, action): 21 | raise NotImplementedError() 22 | 23 | @abstractmethod 24 | def reset(self): 25 | raise NotImplementedError() 26 | 27 | @abstractmethod 28 | def close(self): 29 | raise NotImplementedError() 30 | 31 | 32 | class VectorEnv(with_metaclass(ABCMeta, object)): 33 | """Parallel RL learning environments.""" 34 | 35 | @abstractmethod 36 | def step(self, action): 37 | raise NotImplementedError() 38 | 39 | @abstractmethod 40 | def reset(self, mask): 41 | """Reset envs. 42 | 43 | Args: 44 | mask (Sequence of bool): Mask array that specifies which env to 45 | skip. If omitted, all the envs are reset. 46 | """ 47 | raise NotImplementedError() 48 | 49 | @abstractmethod 50 | def seed(self, seeds): 51 | raise NotImplementedError() 52 | 53 | @abstractmethod 54 | def close(self): 55 | raise NotImplementedError() 56 | -------------------------------------------------------------------------------- /chainerrl/misc/conjugate_gradient.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | 11 | 12 | def conjugate_gradient(A_product_func, b, tol=1e-10, max_iter=10): 13 | """Conjugate Gradient (CG) method. 14 | 15 | This function solves Ax=b for the vector x, where A is a real 16 | positive-definite matrix and b is a real vector. 17 | 18 | Args: 19 | A_product_func (callable): Callable that returns the product of the 20 | matrix A and a given vector. 21 | b (numpy.ndarray or cupy.ndarray): The vector b. 22 | tol (float): Tolerance parameter for early stopping. 23 | max_iter (int): Maximum number of iterations. 24 | 25 | Returns: 26 | numpy.ndarray or cupy.ndarray: The solution. 
27 | The array module will be the same as the argument b's. 28 | """ 29 | xp = chainer.cuda.get_array_module(b) 30 | x = xp.zeros_like(b) 31 | r0 = b - A_product_func(x) 32 | p = r0 33 | for i in range(max_iter): 34 | a = xp.dot(r0, r0) / xp.dot(A_product_func(p), p) 35 | x = x + p * a 36 | r1 = r0 - A_product_func(p) * a 37 | if xp.linalg.norm(r1) < tol: 38 | return x 39 | b = xp.dot(r1, r1) / xp.dot(r0, r0) 40 | p = r1 + b * p 41 | r0 = r1 42 | return x 43 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | cache: pip 3 | python: 4 | - "2.7" 5 | - "3.5.1" 6 | env: 7 | - CHAINER_VERSION=3 8 | - CHAINER_VERSION=stable 9 | # command to install dependencies 10 | install: 11 | - pip install --upgrade pip setuptools wheel 12 | - | 13 | if [[ $CHAINER_VERSION == 3 ]]; then 14 | pip install "chainer==3.1.0" 15 | else 16 | pip install chainer 17 | fi 18 | - pip install pytest-cov 19 | - pip install -r requirements.txt --only-binary=numpy,scipy 20 | - pip install jupyter 21 | # gym 0.11.0 causes an error with Python 2 22 | - pip install "gym!=0.11.0" 23 | # atari_py==0.1.4 causes an error 24 | - pip install atari_py==0.1.1 25 | - pip install autopep8 26 | - pip install flake8 27 | - pip install coveralls 28 | - pip install opencv-python 29 | - pip install pybullet 30 | - python setup.py develop 31 | - python -c "import numpy; numpy.show_config()" 32 | before_script: 33 | - "export DISPLAY=:99.0" 34 | - sh -e /etc/init.d/xvfb start 35 | - sleep 3 36 | # command to run tests 37 | script: 38 | - flake8 chainerrl 39 | - flake8 tests 40 | - flake8 examples 41 | - autopep8 -r chainerrl tests examples --diff | tee check_autopep8 42 | - test ! -s check_autopep8 43 | - pytest -m "not gpu and not slow" -x tests --cov=chainerrl 44 | - ./test_examples.sh -1 45 | - if [[ $TRAVIS_PYTHON_VERSION == 3.5.1 && $CHAINER_VERSION == stable ]]; then jupyter nbconvert --to notebook --execute examples/quickstart/quickstart.ipynb --ExecutePreprocessor.timeout=600; fi 46 | after_success: 47 | - coveralls 48 | -------------------------------------------------------------------------------- /chainerrl/optimizers/nonbias_weight_decay.py: -------------------------------------------------------------------------------- 1 | # This caused an error in py2 because cupy expect non-unicode str 2 | # from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import division 5 | from __future__ import absolute_import 6 | from builtins import * # NOQA 7 | from future import standard_library 8 | standard_library.install_aliases() # NOQA 9 | from chainer import cuda 10 | 11 | 12 | class NonbiasWeightDecay(object): 13 | 14 | """Weight decay only for non-bias parameters. 15 | 16 | This hook can be used just like chainer.optimizer_hooks.WeightDecay except 17 | that this hook does not apply weight decay to bias parameters. 18 | 19 | This hook assumes that all the bias parameters have the name of "b". Any 20 | parameter whose name is "b" is considered as a bias and excluded from 21 | weight decay. 
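A short usage sketch for the hook documented above, modeled on the optimizer tests later in this repo; the model shape and decay rate are illustrative:
```
import chainer
import chainer.links as L

from chainerrl.optimizers import NonbiasWeightDecay

model = L.Linear(3, 2)
optimizer = chainer.optimizers.SGD(lr=0.1)
optimizer.setup(model)
# W is decayed on every update; the bias parameter b is left untouched.
optimizer.add_hook(NonbiasWeightDecay(rate=1e-4))
```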
22 | """ 23 | name = 'NonbiasWeightDecay' 24 | call_for_each_param = True 25 | timing = 'pre' 26 | 27 | def __init__(self, rate): 28 | self.rate = rate 29 | 30 | def __call__(self, rule, param): 31 | if param.name == 'b': 32 | return 33 | p, g = param.array, param.grad 34 | if p is None or g is None: 35 | return 36 | with cuda.get_device_from_array(p) as dev: 37 | if int(dev) == -1: 38 | g += self.rate * p 39 | else: 40 | kernel = cuda.elementwise( 41 | 'T p, T decay', 'T g', 'g += decay * p', 'weight_decay') 42 | kernel(p, self.rate, g) 43 | -------------------------------------------------------------------------------- /chainerrl/misc/random_seed.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import os 10 | import random 11 | 12 | import chainer 13 | import numpy as np 14 | 15 | 16 | def set_random_seed(seed, gpus=()): 17 | """Set a given random seed to ChainerRL's random sources. 18 | 19 | This function sets a given random seed to random sources that ChainerRL 20 | depends on so that ChainerRL can be deterministic. It is not responsible 21 | for setting a random seed to environments ChainerRL is applied to. 22 | 23 | Note that there's no guaranteed way to make all the computations done by 24 | Chainer deterministic. See https://github.com/chainer/chainer/issues/4134. 25 | 26 | Args: 27 | seed (int): Random seed [0, 2 ** 32). 28 | gpus (tuple of ints): GPU device IDs to use. Negative values are 29 | ignored. 30 | """ 31 | # ChainerRL depends on random 32 | random.seed(seed) 33 | # ChainerRL depends on numpy.random 34 | np.random.seed(seed) 35 | # ChainerRL depends on cupy.random for GPU computation 36 | for gpu in gpus: 37 | if gpu >= 0: 38 | with chainer.cuda.get_device_from_id(gpu): 39 | chainer.cuda.cupy.random.seed(seed) 40 | # chainer.functions.n_step_rnn directly depends on CHAINER_SEED 41 | os.environ['CHAINER_SEED'] = str(seed) 42 | -------------------------------------------------------------------------------- /tests/agents_tests/test_al.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from builtins import * # NOQA 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_dqn_like as base 10 | from basetest_training import _TestBatchTrainingMixin 11 | from chainerrl.agents.al import AL 12 | 13 | 14 | class TestALOnDiscreteABC( 15 | _TestBatchTrainingMixin, 16 | base._TestDQNOnDiscreteABC): 17 | 18 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 19 | return AL( 20 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 21 | replay_start_size=100, target_update_interval=100) 22 | 23 | 24 | class TestALOnContinuousABC( 25 | _TestBatchTrainingMixin, 26 | base._TestDQNOnContinuousABC): 27 | 28 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 29 | return AL( 30 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 31 | replay_start_size=100, target_update_interval=100) 32 | 33 | 34 | # Batch training with recurrent models is currently not supported 35 | class 
TestALOnDiscretePOABC(base._TestDQNOnDiscretePOABC): 36 | 37 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 38 | return AL( 39 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 40 | replay_start_size=100, target_update_interval=100, 41 | episodic_update=True) 42 | -------------------------------------------------------------------------------- /tests/agents_tests/test_pal.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_dqn_like as base 10 | from basetest_training import _TestBatchTrainingMixin 11 | from chainerrl.agents.pal import PAL 12 | 13 | 14 | class TestPALOnDiscreteABC( 15 | _TestBatchTrainingMixin, 16 | base._TestDQNOnDiscreteABC): 17 | 18 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 19 | return PAL( 20 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 21 | replay_start_size=100, target_update_interval=100) 22 | 23 | 24 | class TestPALOnContinuousABC( 25 | _TestBatchTrainingMixin, 26 | base._TestDQNOnContinuousABC): 27 | 28 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 29 | return PAL( 30 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 31 | replay_start_size=100, target_update_interval=100) 32 | 33 | 34 | # Batch training with recurrent models is currently not supported 35 | class TestPALOnDiscretePOABC(base._TestDQNOnDiscretePOABC): 36 | 37 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 38 | return PAL( 39 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 40 | replay_start_size=100, target_update_interval=100, 41 | episodic_update=True) 42 | -------------------------------------------------------------------------------- /tests/agents_tests/test_double_pal.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from builtins import * # NOQA 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainerrl.agents.double_pal import DoublePAL 10 | 11 | import basetest_dqn_like 12 | from basetest_training import _TestBatchTrainingMixin 13 | 14 | 15 | class TestDoublePALOnDiscreteABC( 16 | _TestBatchTrainingMixin, 17 | basetest_dqn_like._TestDQNOnDiscreteABC): 18 | 19 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 20 | return DoublePAL( 21 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 22 | replay_start_size=100, target_update_interval=100) 23 | 24 | 25 | class TestDoublePALOnContinuousABC( 26 | _TestBatchTrainingMixin, 27 | basetest_dqn_like._TestDQNOnContinuousABC): 28 | 29 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 30 | return DoublePAL( 31 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 32 | replay_start_size=100, target_update_interval=100) 33 | 34 | 35 | class TestDoublePALOnDiscretePOABC(basetest_dqn_like._TestDQNOnDiscretePOABC): 36 | 37 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 38 | return DoublePAL( 39 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 40 | 
replay_start_size=100, target_update_interval=100, 41 | episodic_update=True) 42 | -------------------------------------------------------------------------------- /chainerrl/v_functions/v_functions.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | 12 | from chainerrl.links.mlp import MLP 13 | from chainerrl.recurrent import RecurrentChainMixin 14 | from chainerrl.v_function import VFunction 15 | 16 | 17 | class SingleModelVFunction( 18 | chainer.Chain, VFunction, RecurrentChainMixin): 19 | """V-function 20 | 21 | Args: 22 | model (chainer.Link): 23 | Callable Link that takes states as input and outputs state values. 24 | """ 25 | 26 | def __init__(self, model): 27 | super().__init__(model=model) 28 | 29 | def __call__(self, x): 30 | h = self.model(x) 31 | return h 32 | 33 | 34 | class FCVFunction(SingleModelVFunction): 35 | 36 | def __init__(self, n_input_channels, n_hidden_layers=0, 37 | n_hidden_channels=None, nonlinearity=F.relu, 38 | last_wscale=1): 39 | self.n_input_channels = n_input_channels 40 | self.n_hidden_layers = n_hidden_layers 41 | self.n_hidden_channels = n_hidden_channels 42 | 43 | super().__init__( 44 | model=MLP(self.n_input_channels, 1, 45 | [self.n_hidden_channels] * self.n_hidden_layers, 46 | nonlinearity=nonlinearity, 47 | last_wscale=last_wscale), 48 | ) 49 | -------------------------------------------------------------------------------- /tests/wrappers_tests/test_continuing_time_limit.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import mock 10 | import unittest 11 | 12 | from chainer import testing 13 | 14 | import chainerrl 15 | 16 | 17 | @testing.parameterize(*testing.product({ 18 | 'max_episode_steps': [1, 2, 3], 19 | })) 20 | class TestContinuingTimeLimit(unittest.TestCase): 21 | 22 | def test(self): 23 | env = mock.Mock() 24 | env.reset.side_effect = ['state'] * 2 25 | # Since info dicts are modified by the wrapper, each step call needs to 26 | # return a new info dict.
27 | env.step.side_effect = [('state', 0, False, {}) for _ in range(6)] 28 | env = chainerrl.wrappers.ContinuingTimeLimit( 29 | env, max_episode_steps=self.max_episode_steps) 30 | 31 | env.reset() 32 | for t in range(2): 33 | _, _, done, info = env.step(0) 34 | if t + 1 >= self.max_episode_steps: 35 | self.assertTrue(info['needs_reset']) 36 | else: 37 | self.assertFalse(info.get('needs_reset', False)) 38 | 39 | env.reset() 40 | for t in range(4): 41 | _, _, done, info = env.step(0) 42 | if t + 1 >= self.max_episode_steps: 43 | self.assertTrue(info['needs_reset']) 44 | else: 45 | self.assertFalse(info.get('needs_reset', False)) 46 | -------------------------------------------------------------------------------- /tests/misc_tests/test_conjugate_gradient.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import chainer 10 | from chainer import testing 11 | from chainer.testing import condition 12 | import numpy as np 13 | 14 | import chainerrl 15 | 16 | 17 | @testing.parameterize( 18 | *testing.product({ 19 | 'n': [1, 5], 20 | 'dtype': [np.float64, np.float32], 21 | }) 22 | ) 23 | class TestConjugateGradient(unittest.TestCase): 24 | 25 | def _test(self, xp): 26 | # A must be symmetric and positive-definite 27 | random_mat = xp.random.normal(size=(self.n, self.n)).astype(self.dtype) 28 | A = random_mat.dot(random_mat.T) 29 | x_ans = xp.random.normal(size=self.n).astype(self.dtype) 30 | b = A.dot(x_ans) 31 | 32 | def A_product_func(vec): 33 | self.assertEqual(xp, chainer.cuda.get_array_module(vec)) 34 | self.assertEqual(vec.shape, b.shape) 35 | return A.dot(vec) 36 | 37 | x = chainerrl.misc.conjugate_gradient(A_product_func, b) 38 | self.assertEqual(x.dtype, self.dtype) 39 | self.assertTrue(chainer.cuda.get_array_module(x), xp) 40 | xp.testing.assert_allclose(x, x_ans, rtol=1e-3) 41 | 42 | @condition.retry(3) 43 | def test_cpu(self): 44 | self._test(np) 45 | 46 | @testing.attr.gpu 47 | @condition.retry(3) 48 | def test_gpu(self): 49 | self._test(chainer.cuda.cupy) 50 | -------------------------------------------------------------------------------- /tests/agents_tests/test_double_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_dqn_like 10 | from basetest_training import _TestBatchTrainingMixin 11 | from chainerrl.agents.double_dqn import DoubleDQN 12 | 13 | 14 | class TestDoubleDQNOnDiscreteABC( 15 | _TestBatchTrainingMixin, 16 | basetest_dqn_like._TestDQNOnDiscreteABC): 17 | 18 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 19 | return DoubleDQN( 20 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 21 | replay_start_size=100, target_update_interval=100) 22 | 23 | 24 | class TestDoubleDQNOnContinuousABC( 25 | _TestBatchTrainingMixin, 26 | basetest_dqn_like._TestDQNOnContinuousABC): 27 | 28 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 29 | return DoubleDQN( 30 | q_func, opt, rbuf, gpu=gpu, 
gamma=0.9, explorer=explorer, 31 | replay_start_size=100, target_update_interval=100) 32 | 33 | 34 | # Batch training with recurrent models is currently not supported 35 | class TestDoubleDQNOnDiscretePOABC(basetest_dqn_like._TestDQNOnDiscretePOABC): 36 | 37 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 38 | return DoubleDQN( 39 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 40 | replay_start_size=100, target_update_interval=100, 41 | episodic_update=True) 42 | -------------------------------------------------------------------------------- /tests/misc_tests/test_batch_states.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import chainer 10 | from chainer import testing 11 | import numpy as np 12 | 13 | import chainerrl 14 | 15 | 16 | class TestBatchStates(unittest.TestCase): 17 | 18 | def _test(self, xp): 19 | 20 | # state: ((2,2)-shaped array, integer, (1,)-shaped array) 21 | states = [ 22 | (np.arange(4).reshape((2, 2)), 0, np.zeros(1)), 23 | (np.arange(4).reshape((2, 2)) + 1, 1, np.zeros(1) + 1), 24 | ] 25 | 26 | def phi(state): 27 | return state[0] * 2, state[1], state[2] * 3 28 | 29 | batch = chainerrl.misc.batch_states(states, xp=xp, phi=phi) 30 | self.assertIsInstance(batch, tuple) 31 | batch_a, batch_b, batch_c = batch 32 | xp.testing.assert_allclose( 33 | batch_a, 34 | xp.asarray([ 35 | [[0, 2], 36 | [4, 6]], 37 | [[2, 4], 38 | [6, 8]], 39 | ]) 40 | ) 41 | xp.testing.assert_allclose( 42 | batch_b, 43 | xp.asarray([0, 1]) 44 | ) 45 | xp.testing.assert_allclose( 46 | batch_c, 47 | xp.asarray([ 48 | [0], 49 | [3], 50 | ]) 51 | ) 52 | 53 | def test_cpu(self): 54 | self._test(np) 55 | 56 | @testing.attr.gpu 57 | def test_gpu(self): 58 | self._test(chainer.cuda.cupy) 59 | -------------------------------------------------------------------------------- /tests/agents_tests/basetest_agents.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import os 8 | import tempfile 9 | import unittest 10 | 11 | from chainer import testing 12 | 13 | from chainerrl.envs.abc import ABC 14 | from chainerrl.experiments.train_agent import train_agent 15 | 16 | 17 | class _TestAgentInterface(unittest.TestCase): 18 | 19 | def setUp(self): 20 | self.env = ABC(discrete=self.discrete, 21 | partially_observable=self.partially_observable, 22 | episodic=self.episodic) 23 | 24 | def create_agent(self, env): 25 | raise NotImplementedError() 26 | 27 | def test_save_load(self): 28 | a = self.create_agent(self.env) 29 | dirname = tempfile.mkdtemp() 30 | a.save(dirname) 31 | self.assertTrue(os.path.exists(dirname)) 32 | b = self.create_agent(self.env) 33 | b.load(dirname) 34 | 35 | def test_run_episode(self): 36 | agent = self.create_agent(self.env) 37 | done = False 38 | obs = self.env.reset() 39 | t = 0 40 | while t < 10 and not done: 41 | a = agent.act(obs) 42 | obs, r, done, info = self.env.step(a) 43 | t += 1 44 | 45 | @testing.attr.slow 46 | def test_train(self): 47 | agent = 
self.create_agent(self.env) 48 | train_agent( 49 | agent=agent, 50 | env=self.env, 51 | steps=2000, 52 | outdir=tempfile.mkdtemp(), 53 | max_episode_len=10) 54 | -------------------------------------------------------------------------------- /tests/agents_tests/test_residual_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import basetest_dqn_like as base 9 | from basetest_training import _TestBatchTrainingMixin 10 | from chainerrl.agents.residual_dqn import ResidualDQN 11 | 12 | 13 | class TestResidualDQNOnDiscreteABC( 14 | _TestBatchTrainingMixin, 15 | base._TestDQNOnDiscreteABC): 16 | 17 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 18 | return ResidualDQN( 19 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 20 | replay_start_size=100, target_update_interval=100, 21 | grad_scale=1e-1) 22 | 23 | 24 | class TestResidualDQNOnContinuousABC( 25 | _TestBatchTrainingMixin, 26 | base._TestDQNOnContinuousABC): 27 | 28 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 29 | return ResidualDQN( 30 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 31 | replay_start_size=100, target_update_interval=100, 32 | grad_scale=1e-1) 33 | 34 | 35 | # Batch training with recurrent models is currently not supported 36 | class TestResidualDQNOnDiscretePOABC(base._TestDQNOnDiscretePOABC): 37 | 38 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 39 | return ResidualDQN( 40 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 41 | replay_start_size=100, target_update_interval=100, 42 | episodic_update=True, 43 | grad_scale=1e-1) 44 | -------------------------------------------------------------------------------- /tools/plot_scores.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() 7 | import argparse 8 | import os 9 | 10 | import matplotlib 11 | matplotlib.use('Agg') # Needed to run without X-server 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--title', type=str, default='') 19 | parser.add_argument('--file', action='append', dest='files', 20 | default=[], type=str, 21 | help='specify paths of scores.txt') 22 | parser.add_argument('--label', action='append', dest='labels', 23 | default=[], type=str, 24 | help='specify labels for scores.txt files') 25 | args = parser.parse_args() 26 | 27 | assert len(args.files) > 0 28 | assert len(args.labels) == len(args.files) 29 | 30 | for fpath, label in zip(args.files, args.labels): 31 | if os.path.isdir(fpath): 32 | fpath = os.path.join(fpath, 'scores.txt') 33 | assert os.path.exists(fpath) 34 | scores = pd.read_csv(fpath, delimiter='\t') 35 | plt.plot(scores['steps'], scores['mean'], label=label) 36 | 37 | plt.xlabel('steps') 38 | plt.ylabel('score') 39 | plt.legend(loc='best') 40 | if args.title: 41 | plt.title(args.title) 42 | 43 | fig_fname = args.files[0] + args.title + '.png' 44 | plt.savefig(fig_fname) 45 | 
print('Saved a figure as {}'.format(fig_fname)) 46 | 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /chainerrl/links/mlp.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | from chainer import links as L 12 | 13 | from chainerrl.initializers import LeCunNormal 14 | 15 | 16 | class MLP(chainer.Chain): 17 | """Multi-Layer Perceptron""" 18 | 19 | def __init__(self, in_size, out_size, hidden_sizes, nonlinearity=F.relu, 20 | last_wscale=1): 21 | self.in_size = in_size 22 | self.out_size = out_size 23 | self.hidden_sizes = hidden_sizes 24 | self.nonlinearity = nonlinearity 25 | 26 | super().__init__() 27 | with self.init_scope(): 28 | if hidden_sizes: 29 | hidden_layers = [] 30 | hidden_layers.append(L.Linear(in_size, hidden_sizes[0])) 31 | for hin, hout in zip(hidden_sizes, hidden_sizes[1:]): 32 | hidden_layers.append(L.Linear(hin, hout)) 33 | self.hidden_layers = chainer.ChainList(*hidden_layers) 34 | self.output = L.Linear(hidden_sizes[-1], out_size, 35 | initialW=LeCunNormal(last_wscale)) 36 | else: 37 | self.output = L.Linear(in_size, out_size, 38 | initialW=LeCunNormal(last_wscale)) 39 | 40 | def __call__(self, x): 41 | h = x 42 | if self.hidden_sizes: 43 | for l in self.hidden_layers: 44 | h = self.nonlinearity(l(h)) 45 | return self.output(h) 46 | -------------------------------------------------------------------------------- /chainerrl/envs/serial_vector_env.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import numpy as np 10 | 11 | import chainerrl 12 | 13 | 14 | class SerialVectorEnv(chainerrl.env.VectorEnv): 15 | """VectorEnv where each env is run sequentially. 16 | 17 | The purpose of this VectorEnv is to help with debugging. For speed, you 18 | should use MultiprocessVectorEnv if possible. 19 | 20 | Args: 21 | envs (list of gym.Env): The environments to run serially.
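A hedged sketch of calling the MLP link defined above; the sizes are arbitrary and only meant to show the expected shapes:
```
import numpy as np

from chainerrl.links.mlp import MLP

mlp = MLP(in_size=4, out_size=2, hidden_sizes=(16, 16))
x = np.zeros((5, 4), dtype=np.float32)  # a batch of 5 four-dimensional inputs
y = mlp(x)  # a chainer.Variable
print(y.shape)  # (5, 2)
```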
22 | """ 23 | 24 | def __init__(self, envs): 25 | self.envs = envs 26 | self.last_obs = [None] * self.num_envs 27 | self.action_space = envs[0].action_space 28 | self.observation_space = envs[0].observation_space 29 | self.spec = envs[0].observation_space 30 | 31 | def step(self, actions): 32 | results = [env.step(a) for env, a in zip(self.envs, actions)] 33 | self.last_obs, rews, dones, infos = zip(*results) 34 | return self.last_obs, rews, dones, infos 35 | 36 | def reset(self, mask=None): 37 | if mask is None: 38 | mask = np.zeros(self.num_envs) 39 | obs = [env.reset() if not m else o 40 | for m, env, o in zip(mask, self.envs, self.last_obs)] 41 | self.last_obs = obs 42 | return obs 43 | 44 | def seed(self, seeds): 45 | for env, seed in zip(self.envs, seeds): 46 | env.seed(seed) 47 | 48 | def close(self): 49 | for env in self.envs: 50 | env.close() 51 | 52 | @property 53 | def num_envs(self): 54 | return len(self.envs) 55 | -------------------------------------------------------------------------------- /chainerrl/wrappers/continuing_time_limit.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import gym 10 | 11 | 12 | class ContinuingTimeLimit(gym.Wrapper): 13 | """TimeLimit wrapper for continuing environments. 14 | 15 | This is similar gym.wrappers.TimeLimit, which sets a time limit for 16 | each episode, except that done=False is returned and that 17 | info['needs_reset'] is set to True when past the limit. 18 | 19 | Code that calls env.step is responsible for checking the info dict, the 20 | fourth returned value, and resetting the env if it has the 'needs_reset' 21 | key and its value is True. 22 | 23 | Args: 24 | env (gym.Env): Env to wrap. 25 | max_episode_steps (int): Maximum number of timesteps during an episode, 26 | after which the env needs a reset. 27 | """ 28 | 29 | def __init__(self, env, max_episode_steps): 30 | super(ContinuingTimeLimit, self).__init__(env) 31 | self._max_episode_steps = max_episode_steps 32 | 33 | self._elapsed_steps = None 34 | 35 | def step(self, action): 36 | assert self._elapsed_steps is not None,\ 37 | "Cannot call env.step() before calling reset()" 38 | observation, reward, done, info = self.env.step(action) 39 | self._elapsed_steps += 1 40 | 41 | if self._max_episode_steps <= self._elapsed_steps: 42 | info['needs_reset'] = True 43 | 44 | return observation, reward, done, info 45 | 46 | def reset(self): 47 | self._elapsed_steps = 0 48 | return self.env.reset() 49 | -------------------------------------------------------------------------------- /chainerrl/wrappers/randomize_action.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import gym 10 | import numpy as np 11 | 12 | 13 | class RandomizeAction(gym.ActionWrapper): 14 | """Apply a random action instead of the one sent by the agent. 15 | 16 | This wrapper can be used to make a stochastic env. 
The common use is 17 | for evaluation in Atari environments, where actions are replaced with 18 | random ones with a low probability. 19 | 20 | Only gym.spaces.Discrete is supported as an action space. 21 | 22 | For exploration during training, use explorers like 23 | chainerrl.explorers.ConstantEpsilonGreedy instead of this wrapper. 24 | 25 | Args: 26 | env (gym.Env): Env to wrap. 27 | random_fraction (float): Fraction of actions that will be replaced 28 | with a random action. It must be in [0, 1]. 29 | """ 30 | 31 | def __init__(self, env, random_fraction): 32 | super().__init__(env) 33 | assert 0 <= random_fraction <= 1 34 | assert isinstance(env.action_space, gym.spaces.Discrete),\ 35 | 'RandomizeAction supports only gym.spaces.Discrete as an action space' # NOQA 36 | self._random_fraction = random_fraction 37 | self._np_random = np.random.RandomState() 38 | 39 | def _action(self, action): 40 | if self._np_random.rand() < self._random_fraction: 41 | return self._np_random.randint(self.env.action_space.n) 42 | else: 43 | return action 44 | 45 | def seed(self, seed): 46 | super().seed(seed) 47 | self._np_random.seed(seed) 48 | -------------------------------------------------------------------------------- /tests/optimizer_tests/test_nonbias_weight_decay.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | import chainer 12 | import chainer.links as L 13 | from chainer import testing 14 | import numpy as np 15 | 16 | import chainerrl 17 | 18 | 19 | @testing.parameterize(*testing.product( 20 | { 21 | 'lr': [1.0, 0.1], 22 | 'weight_decay_rate': [0.1, 0.05] 23 | } 24 | )) 25 | class TestNonbiasWeightDecay(unittest.TestCase): 26 | 27 | def _test(self, gpu): 28 | 29 | model = chainer.Chain( 30 | a=L.Linear(1, 2, initialW=3, initial_bias=3), 31 | b=chainer.Chain(c=L.Linear(2, 3, initialW=4, initial_bias=4)), 32 | ) 33 | if gpu >= 0: 34 | model.to_gpu(gpu) 35 | xp = model.xp 36 | else: 37 | xp = np 38 | optimizer = chainer.optimizers.SGD(self.lr) 39 | optimizer.setup(model) 40 | optimizer.add_hook( 41 | chainerrl.optimizers.NonbiasWeightDecay( 42 | rate=self.weight_decay_rate)) 43 | optimizer.update(lambda: chainer.Variable(xp.asarray(0.0))) 44 | decay_factor = 1 - self.lr * self.weight_decay_rate 45 | xp.testing.assert_allclose(model.a.W.array, 3 * decay_factor) 46 | xp.testing.assert_allclose(model.a.b.array, 3) 47 | xp.testing.assert_allclose(model.b.c.W.array, 4 * decay_factor) 48 | xp.testing.assert_allclose(model.b.c.b.array, 4) 49 | 50 | def test_cpu(self): 51 | self._test(gpu=-1) 52 | 53 | @testing.attr.gpu 54 | def test_gpu(self): 55 | self._test(gpu=0) 56 | -------------------------------------------------------------------------------- /chainerrl/links/noisy_chain.py: -------------------------------------------------------------------------------- 1 | """Noisy Networks 2 | 3 | See http://arxiv.org/abs/1706.10295 4 | """ 5 | 6 | import chainer 7 | from chainer.links import Linear 8 | 9 | from chainerrl.links.noisy_linear import FactorizedNoisyLinear 10 | from chainerrl.links.sequence import Sequence 11 | 12 | 13 | def to_factorized_noisy(link, *args, **kwargs): 14 | """Add noisiness to components of given link 15 | 16 | Currently this 
function supports L.Linear (with and without bias) 17 | """ 18 | 19 | def func_to_factorized_noisy(link): 20 | if isinstance(link, Linear): 21 | return FactorizedNoisyLinear(link, *args, **kwargs) 22 | else: 23 | return link 24 | 25 | _map_links(func_to_factorized_noisy, link) 26 | 27 | 28 | def _map_links(func, link): 29 | if isinstance(link, chainer.Chain): 30 | children_names = link._children.copy() 31 | for name in children_names: 32 | child = getattr(link, name) 33 | new_child = func(child) 34 | if new_child is child: 35 | _map_links(func, child) 36 | else: 37 | delattr(link, name) 38 | with link.init_scope(): 39 | setattr(link, name, new_child) 40 | elif isinstance(link, chainer.ChainList): 41 | children = link._children 42 | for i in range(len(children)): 43 | child = children[i] 44 | new_child = func(child) 45 | if new_child is child: 46 | _map_links(func, child) 47 | else: 48 | # mimic ChainList.add_link 49 | children[i] = new_child 50 | children[i].name = str(i) 51 | 52 | if isinstance(link, Sequence): 53 | # assumes i-th layer corresponds with i-th child 54 | link.layers[i] = new_child 55 | -------------------------------------------------------------------------------- /tests/explorers_tests/test_boltzmann.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import unittest 9 | 10 | import chainer 11 | import numpy as np 12 | 13 | import chainerrl 14 | 15 | 16 | def count_actions_selected_by_boltzmann(T, q_values): 17 | 18 | def greedy_action_func(): 19 | raise RuntimeError('Must not be called') 20 | 21 | explorer = chainerrl.explorers.Boltzmann(T=T) 22 | action_value = chainerrl.action_value.DiscreteActionValue(q_values) 23 | 24 | action_count = [0] * 3 25 | 26 | for t in range(10000): 27 | a = explorer.select_action(t, greedy_action_func, action_value) 28 | action_count[a] += 1 29 | 30 | return action_count 31 | 32 | 33 | class TestBoltzmann(unittest.TestCase): 34 | 35 | def test_boltzmann(self): 36 | 37 | # T=1 38 | q_values = chainer.Variable(np.asarray([[-1, 1, 0]], dtype=np.float32)) 39 | action_count = count_actions_selected_by_boltzmann(1, q_values) 40 | print('T=1', action_count) 41 | # Actions with larger values must be selected more often 42 | self.assertGreater(action_count[1], action_count[2]) 43 | self.assertGreater(action_count[2], action_count[0]) 44 | 45 | # T=0.5 46 | action_count_t05 = count_actions_selected_by_boltzmann(0.5, q_values) 47 | print('T=0.5', action_count_t05) 48 | # Actions with larger values must be selected more often 49 | self.assertGreater(action_count_t05[1], action_count_t05[2]) 50 | self.assertGreater(action_count_t05[2], action_count_t05[0]) 51 | 52 | # T=0.5 must be more greedy than T=1 53 | self.assertGreater(action_count_t05[1], action_count[1]) 54 | -------------------------------------------------------------------------------- /tests/misc_tests/test_random_seed.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from builtins import * # NOQA 7 | standard_library.install_aliases() # NOQA 8 | 9 | import random 
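A standalone NumPy re-derivation (hedged; not library code) of what the Boltzmann test above asserts: lowering the temperature T makes the softmax over Q-values more greedy.
```
import numpy as np


def boltzmann_probs(q, T):
    z = np.exp((q - q.max()) / T)  # subtract the max for numerical stability
    return z / z.sum()


q = np.array([-1.0, 1.0, 0.0])
print(boltzmann_probs(q, T=1.0))  # ~[0.09, 0.67, 0.24]
print(boltzmann_probs(q, T=0.5))  # ~[0.02, 0.87, 0.12] -- more peaked
```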
10 | import unittest 11 | 12 | import chainer 13 | from chainer.testing import attr 14 | import numpy as np 15 | 16 | import chainerrl 17 | 18 | 19 | class TestSetRandomSeed(unittest.TestCase): 20 | 21 | def test_random(self): 22 | chainerrl.misc.set_random_seed(0) 23 | seed0_0 = random.random() 24 | chainerrl.misc.set_random_seed(1) 25 | seed1_0 = random.random() 26 | chainerrl.misc.set_random_seed(0) 27 | seed0_1 = random.random() 28 | chainerrl.misc.set_random_seed(1) 29 | seed1_1 = random.random() 30 | self.assertEqual(seed0_0, seed0_1) 31 | self.assertEqual(seed1_0, seed1_1) 32 | self.assertNotEqual(seed0_0, seed1_0) 33 | 34 | def _test_xp_random(self, xp, gpus): 35 | chainerrl.misc.set_random_seed(0, gpus=gpus) 36 | seed0_0 = xp.random.rand() 37 | chainerrl.misc.set_random_seed(1, gpus=gpus) 38 | seed1_0 = xp.random.rand() 39 | chainerrl.misc.set_random_seed(0, gpus=gpus) 40 | seed0_1 = xp.random.rand() 41 | chainerrl.misc.set_random_seed(1, gpus=gpus) 42 | seed1_1 = xp.random.rand() 43 | self.assertEqual(seed0_0, seed0_1) 44 | self.assertEqual(seed1_0, seed1_1) 45 | self.assertNotEqual(seed0_0, seed1_0) 46 | 47 | def test_numpy_random(self): 48 | self._test_xp_random(np, gpus=()) 49 | # It should ignore negative device IDs 50 | self._test_xp_random(np, gpus=(-1,)) 51 | 52 | @attr.gpu 53 | def test_cupy_random(self): 54 | self._test_xp_random(chainer.cuda.cupy, gpus=(0,)) 55 | -------------------------------------------------------------------------------- /tests/links_tests/test_mlp_bn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | import chainer 12 | import chainer.functions as F 13 | from chainer import testing 14 | from chainer.testing import attr 15 | import numpy as np 16 | 17 | import chainerrl 18 | 19 | 20 | @testing.parameterize( 21 | *testing.product({ 22 | 'in_size': [1, 5], 23 | 'out_size': [1, 3], 24 | 'hidden_sizes': [(), (1,), (1, 1), (7, 8)], 25 | 'normalize_input': [True, False], 26 | 'normalize_output': [True, False], 27 | 'nonlinearity': ['relu', 'elu'], 28 | 'last_wscale': [1, 1e-3], 29 | }) 30 | ) 31 | class TestMLPBN(unittest.TestCase): 32 | 33 | def _test_call(self, gpu): 34 | nonlinearity = getattr(F, self.nonlinearity) 35 | mlp = chainerrl.links.MLPBN( 36 | in_size=self.in_size, 37 | out_size=self.out_size, 38 | hidden_sizes=self.hidden_sizes, 39 | normalize_input=self.normalize_input, 40 | normalize_output=self.normalize_output, 41 | nonlinearity=nonlinearity, 42 | last_wscale=self.last_wscale, 43 | ) 44 | batch_size = 7 45 | x = np.random.rand(batch_size, self.in_size).astype(np.float32) 46 | if gpu >= 0: 47 | mlp.to_gpu(gpu) 48 | x = chainer.cuda.to_gpu(x) 49 | y = mlp(x) 50 | self.assertEqual(y.shape, (batch_size, self.out_size)) 51 | self.assertEqual(chainer.cuda.get_array_module(y), 52 | chainer.cuda.get_array_module(x)) 53 | 54 | def test_call_cpu(self): 55 | self._test_call(gpu=-1) 56 | 57 | @attr.gpu 58 | def test_call_gpu(self): 59 | self._test_call(gpu=0) 60 | -------------------------------------------------------------------------------- /chainerrl/functions/weighted_sum_arrays.py: -------------------------------------------------------------------------------- 1 | from chainer import cuda 2 | 
from chainer import function 3 | from chainer import utils 4 | from chainer.utils import type_check 5 | 6 | 7 | class WeightedSumArrays(function.Function): 8 | """Element-wise weighted sum of input arrays.""" 9 | 10 | def __init__(self, weights): 11 | self.weights = weights 12 | 13 | def check_type_forward(self, in_types): 14 | type_check.expect( 15 | in_types[0].dtype.kind == 'f', 16 | ) 17 | 18 | def forward_cpu(self, inputs): 19 | y = sum(w * x for w, x in zip(self.weights, inputs)) 20 | return utils.force_array(y), 21 | 22 | def backward(self, inputs, grads): 23 | return [w * grads[0] for w in self.weights] 24 | 25 | def forward_gpu(self, inputs): 26 | n = len(inputs) 27 | ptrs = cuda.cupy.asarray([x.data.ptr for x in inputs], 28 | dtype=cuda.cupy.int64) 29 | ws = cuda.cupy.asarray(self.weights, dtype=cuda.cupy.float32) 30 | y = cuda.elementwise( 31 | 'T x0, int64 xs, raw W ws, int32 n_xs', 32 | 'T y', 33 | 'float** xs_ = (float**) xs;' 34 | 'y = 0;' 35 | 'for (size_t j = 0; j < n_xs; ++j) {' 36 | ' y += xs_[j][i] * ws[j];' 37 | '}', 38 | 'weighted_sum_arrays'.format(n))(inputs[0], 39 | ptrs.data.ptr, 40 | ws, 41 | len(ptrs)) 42 | return y, 43 | 44 | 45 | def weighted_sum_arrays(xs, weights): 46 | """Element-wise weighted sum of input arrays. 47 | 48 | Args: 49 | xs (tuple of ~chainer.Variable or ndarray): Input arrays to be summed. 50 | weights (list of float): Weight coefficients of input arrays. 51 | 52 | Returns: 53 | ~chainer.Variable: Output variable. 54 | """ 55 | return WeightedSumArrays(weights)(*xs) 56 | -------------------------------------------------------------------------------- /chainerrl/q_functions/dueling_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | from chainer import links as L 12 | 13 | from chainerrl import action_value 14 | from chainerrl.links.mlp import MLP 15 | from chainerrl.q_function import StateQFunction 16 | 17 | 18 | class DuelingDQN(chainer.Chain, StateQFunction): 19 | """Dueling Q-Network 20 | 21 | See: http://arxiv.org/abs/1511.06581 22 | """ 23 | 24 | def __init__(self, n_actions, n_input_channels=4, 25 | activation=F.relu, bias=0.1): 26 | self.n_actions = n_actions 27 | self.n_input_channels = n_input_channels 28 | self.activation = activation 29 | 30 | super().__init__() 31 | with self.init_scope(): 32 | self.conv_layers = chainer.ChainList( 33 | L.Convolution2D(n_input_channels, 32, 8, stride=4, 34 | initial_bias=bias), 35 | L.Convolution2D(32, 64, 4, stride=2, initial_bias=bias), 36 | L.Convolution2D(64, 64, 3, stride=1, initial_bias=bias)) 37 | 38 | self.a_stream = MLP(3136, n_actions, [512]) 39 | self.v_stream = MLP(3136, 1, [512]) 40 | 41 | def __call__(self, x): 42 | h = x 43 | for l in self.conv_layers: 44 | h = self.activation(l(h)) 45 | 46 | # Advantage 47 | batch_size = x.shape[0] 48 | ya = self.a_stream(h) 49 | mean = F.reshape( 50 | F.sum(ya, axis=1) / self.n_actions, (batch_size, 1)) 51 | ya, mean = F.broadcast(ya, mean) 52 | ya -= mean 53 | 54 | # State value 55 | ys = self.v_stream(h) 56 | 57 | ya, ys = F.broadcast(ya, ys) 58 | q = ya + ys 59 | return action_value.DiscreteActionValue(q) 60 | 
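The aggregation step of the dueling head above, isolated in NumPy so the broadcast arithmetic is easy to follow (all values invented):
```
import numpy as np

# Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')
advantage = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)  # (batch, n_actions)
state_value = np.array([[10.0]], dtype=np.float32)         # (batch, 1)
q = state_value + advantage - advantage.mean(axis=1, keepdims=True)
print(q)  # [[ 9. 10. 11.]] -- mean-zero advantages shifted by V(s)
```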
-------------------------------------------------------------------------------- /chainerrl/links/sequence.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | 11 | from chainerrl.recurrent import RecurrentChainMixin 12 | 13 | try: 14 | # For Python 3.5 and later 15 | from inspect import Parameter 16 | from inspect import signature 17 | except Exception: 18 | from funcsigs import Parameter 19 | from funcsigs import signature 20 | 21 | 22 | def accept_variable_arguments(func): 23 | for param in signature(func).parameters.values(): 24 | if param.kind in (Parameter.VAR_POSITIONAL, 25 | Parameter.VAR_KEYWORD): 26 | return True 27 | return False 28 | 29 | 30 | class Sequence(chainer.ChainList, RecurrentChainMixin): 31 | """Sequential callable Link that consists of other Links.""" 32 | 33 | def __init__(self, *layers): 34 | self.layers = list(layers) 35 | links = [layer for layer in layers if isinstance(layer, chainer.Link)] 36 | # Cache the signatures because it might be slow 37 | self.argnames = [set(signature(layer).parameters) 38 | for layer in layers] 39 | self.accept_var_args = [accept_variable_arguments(layer) 40 | for layer in layers] 41 | super().__init__(*links) 42 | 43 | def __call__(self, x, **kwargs): 44 | h = x 45 | for layer, argnames, accept_var_args in zip(self.layers, 46 | self.argnames, 47 | self.accept_var_args): 48 | if accept_var_args: 49 | layer_kwargs = kwargs 50 | else: 51 | layer_kwargs = {k: v for k, v in kwargs.items() 52 | if k in argnames} 53 | h = layer(h, **layer_kwargs) 54 | return h 55 | -------------------------------------------------------------------------------- /tests/functions_tests/test_sum_arrays.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import numpy 10 | 11 | import chainer 12 | from chainer import cuda 13 | from chainer import gradient_check 14 | from chainer import testing 15 | from chainer.testing import attr 16 | 17 | import chainerrl 18 | 19 | 20 | @testing.parameterize( 21 | *testing.product({ 22 | 'batchsize': [1, 3], 23 | 'n': [1, 2, 7], 24 | 'shape': [(1,), (1, 1), (2,), (2, 3)], 25 | }) 26 | ) 27 | class TestSumArrays(unittest.TestCase): 28 | 29 | def setUp(self): 30 | self.batch_size = 5 31 | array_shape = (self.batchsize,) + self.shape 32 | self.xs = [numpy.random.uniform( 33 | -1, 1, array_shape).astype(numpy.float32) 34 | for _ in range(self.n)] 35 | self.gy = numpy.random.uniform( 36 | -1, 1, array_shape).astype(numpy.float32) 37 | 38 | def check_forward(self, xs): 39 | y = chainerrl.functions.sum_arrays(xs) 40 | correct_y = sum(self.xs) 41 | gradient_check.assert_allclose(correct_y, cuda.to_cpu(y.array)) 42 | 43 | def test_forward_cpu(self): 44 | self.check_forward(self.xs) 45 | 46 | @attr.gpu 47 | def test_forward_gpu(self): 48 | xs_gpu = [chainer.cuda.to_gpu(x) for x in self.xs] 49 | self.check_forward(xs_gpu) 50 | 51 | def check_backward(self, x_data, y_grad): 52 | 
gradient_check.check_backward( 53 | chainerrl.functions.SumArrays(), 54 | x_data, y_grad, eps=1e-2, rtol=1e-2) 55 | 56 | def test_backward_cpu(self): 57 | self.check_backward(self.xs, self.gy) 58 | 59 | @attr.gpu 60 | def test_backward_gpu(self): 61 | xs_gpu = [chainer.cuda.to_gpu(x) for x in self.xs] 62 | self.check_backward(xs_gpu, cuda.to_gpu(self.gy)) 63 | 64 | 65 | testing.run_module(__name__, __file__) 66 | -------------------------------------------------------------------------------- /chainerrl/agents/residual_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | import chainer.functions as F 9 | 10 | from chainerrl.agents.dqn import DQN 11 | from chainerrl.functions import scale_grad 12 | 13 | 14 | class ResidualDQN(DQN): 15 | """DQN that allows maxQ also backpropagate gradients.""" 16 | 17 | def __init__(self, *args, **kwargs): 18 | self.grad_scale = kwargs.pop('grad_scale', 1.0) 19 | super().__init__(*args, **kwargs) 20 | 21 | def sync_target_network(self): 22 | pass 23 | 24 | def _compute_target_values(self, exp_batch, gamma): 25 | 26 | batch_next_state = exp_batch['next_state'] 27 | 28 | target_next_qout = self.q_function(batch_next_state) 29 | next_q_max = target_next_qout.max 30 | 31 | batch_rewards = exp_batch['reward'] 32 | batch_terminal = exp_batch['is_state_terminal'] 33 | 34 | return batch_rewards + self.gamma * (1.0 - batch_terminal) * next_q_max 35 | 36 | def _compute_y_and_t(self, exp_batch, gamma): 37 | 38 | batch_state = exp_batch['state'] 39 | batch_size = len(batch_state) 40 | 41 | # Compute Q-values for current states 42 | qout = self.q_function(batch_state) 43 | 44 | batch_actions = exp_batch['action'] 45 | batch_q = F.reshape(qout.evaluate_actions( 46 | batch_actions), (batch_size, 1)) 47 | 48 | # Target values must also backprop gradients 49 | batch_q_target = F.reshape( 50 | self._compute_target_values(exp_batch, gamma), (batch_size, 1)) 51 | 52 | return batch_q, scale_grad.scale_grad(batch_q_target, self.grad_scale) 53 | 54 | @property 55 | def saved_attributes(self): 56 | # ResidualDQN doesn't use target models 57 | return ('model', 'optimizer') 58 | 59 | def input_initial_batch_to_target_model(self, batch): 60 | pass 61 | -------------------------------------------------------------------------------- /tests/links_tests/test_sequence.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | from chainerrl.links import Sequence 12 | 13 | 14 | class TestSequence(unittest.TestCase): 15 | 16 | def test_call(self): 17 | 18 | def func_a(x): 19 | return x + 1 20 | 21 | b_test_mode = [False] 22 | 23 | def func_b(x, test=False): 24 | b_test_mode[0] = test 25 | return x + 1 26 | 27 | c_test_mode = [False] 28 | c_hoge_mode = [False] 29 | 30 | def func_c(x, test=False, hoge=False): 31 | c_test_mode[0] = test 32 | c_hoge_mode[0] = hoge 33 | return x + 1 34 | 35 | def _test_call(seq): 36 | 37 | out = seq(1) 38 
| self.assertEqual(out, 4) 39 | self.assertFalse(b_test_mode[0]) 40 | self.assertFalse(c_test_mode[0]) 41 | self.assertFalse(c_hoge_mode[0]) 42 | 43 | out = seq(1, test=True) 44 | self.assertEqual(out, 4) 45 | self.assertTrue(b_test_mode[0]) 46 | self.assertTrue(c_test_mode[0]) 47 | self.assertFalse(c_hoge_mode[0]) 48 | 49 | out = seq(1, test=True, hoge=True) 50 | self.assertEqual(out, 4) 51 | self.assertTrue(b_test_mode[0]) 52 | self.assertTrue(c_test_mode[0]) 53 | self.assertTrue(c_hoge_mode[0]) 54 | 55 | out = seq(1, test=False, hoge=True) 56 | self.assertEqual(out, 4) 57 | self.assertFalse(b_test_mode[0]) 58 | self.assertFalse(c_test_mode[0]) 59 | self.assertTrue(c_hoge_mode[0]) 60 | 61 | _test_call(Sequence(func_a, func_b, func_c)) 62 | _test_call(Sequence(Sequence(func_a, func_b, func_c))) 63 | _test_call(Sequence(Sequence(func_a), 64 | Sequence(func_b), Sequence(func_c))) 65 | -------------------------------------------------------------------------------- /chainerrl/policies/softmax_policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from logging import getLogger 10 | 11 | import chainer 12 | from chainer import functions as F 13 | 14 | from chainerrl import distribution 15 | from chainerrl.links.mlp import MLP 16 | from chainerrl.policy import Policy 17 | 18 | 19 | logger = getLogger(__name__) 20 | 21 | 22 | class SoftmaxPolicy(chainer.Chain, Policy): 23 | """Softmax policy that uses Boltzmann distributions. 24 | 25 | Args: 26 | model (chainer.Link): 27 | Link that is callable and outputs action values. 28 | beta (float): 29 | Parameter of Boltzmann distributions. 
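        min_prob (float):
            Minimum probability assigned to each action by the resulting
            distribution.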
30 | """ 31 | 32 | def __init__(self, model, beta=1.0, min_prob=0.0): 33 | self.beta = beta 34 | self.min_prob = min_prob 35 | super().__init__(model=model) 36 | 37 | def __call__(self, x): 38 | h = self.model(x) 39 | return distribution.SoftmaxDistribution( 40 | h, beta=self.beta, min_prob=self.min_prob) 41 | 42 | 43 | class FCSoftmaxPolicy(SoftmaxPolicy): 44 | """Softmax policy that consists of FC layers and rectifiers""" 45 | 46 | def __init__(self, n_input_channels, n_actions, 47 | n_hidden_layers=0, n_hidden_channels=None, 48 | beta=1.0, nonlinearity=F.relu, 49 | last_wscale=1.0, 50 | min_prob=0.0): 51 | self.n_input_channels = n_input_channels 52 | self.n_actions = n_actions 53 | self.n_hidden_layers = n_hidden_layers 54 | self.n_hidden_channels = n_hidden_channels 55 | self.beta = beta 56 | 57 | super().__init__( 58 | model=MLP(n_input_channels, 59 | n_actions, 60 | (n_hidden_channels,) * n_hidden_layers, 61 | nonlinearity=nonlinearity, 62 | last_wscale=last_wscale), 63 | beta=self.beta, 64 | min_prob=min_prob) 65 | -------------------------------------------------------------------------------- /tests/explorers_tests/test_epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import logging 9 | import unittest 10 | 11 | from chainerrl.explorers import epsilon_greedy 12 | 13 | 14 | class TestEpsilonGreedy(unittest.TestCase): 15 | 16 | def test_linear_decay_epsilon_greedy(self): 17 | 18 | random_action_func_count = [0] 19 | greedy_action_func_count = [0] 20 | 21 | def random_action_func(): 22 | random_action_func_count[0] += 1 23 | return 0 24 | 25 | def greedy_action_func(): 26 | greedy_action_func_count[0] += 1 27 | return 0 28 | 29 | explorer = epsilon_greedy.LinearDecayEpsilonGreedy(1.0, 0.1, 50, 30 | random_action_func) 31 | 32 | explorer.logger.addHandler(logging.StreamHandler()) 33 | explorer.logger.setLevel(logging.DEBUG) 34 | 35 | self.assertAlmostEqual(explorer.epsilon, 1.0) 36 | 37 | for t in range(100): 38 | explorer.select_action(t, greedy_action_func) 39 | 40 | self.assertAlmostEqual(explorer.epsilon, 0.1) 41 | 42 | def test_constant_epsilon_greedy(self): 43 | 44 | random_action_func_count = [0] 45 | greedy_action_func_count = [0] 46 | 47 | def random_action_func(): 48 | random_action_func_count[0] += 1 49 | return 0 50 | 51 | def greedy_action_func(): 52 | greedy_action_func_count[0] += 1 53 | return 0 54 | 55 | explorer = epsilon_greedy.ConstantEpsilonGreedy(0.1, 56 | random_action_func) 57 | 58 | explorer.logger.addHandler(logging.StreamHandler()) 59 | explorer.logger.setLevel(logging.DEBUG) 60 | 61 | self.assertAlmostEqual(explorer.epsilon, 0.1) 62 | 63 | for t in range(100): 64 | explorer.select_action(t, greedy_action_func) 65 | 66 | self.assertAlmostEqual(explorer.epsilon, 0.1) 67 | -------------------------------------------------------------------------------- /tests/wrappers_tests/test_cast_observation.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # 
NOQA 8 | 9 | 10 | import unittest 11 | 12 | from chainer import testing 13 | import gym 14 | import numpy as np 15 | 16 | import chainerrl 17 | 18 | 19 | @testing.parameterize(*testing.product({ 20 | 'env_id': ['CartPole-v1', 'Pendulum-v0'], 21 | 'dtype': [np.float16, np.float32, np.float64] 22 | })) 23 | class TestCastObservation(unittest.TestCase): 24 | 25 | def test_cast_observation(self): 26 | env = chainerrl.wrappers.CastObservation( 27 | gym.make(self.env_id), dtype=self.dtype) 28 | rtol = 1e-3 if self.dtype == np.float16 else 1e-7 29 | 30 | obs = env.reset() 31 | self.assertEqual(env.original_observation.dtype, np.float64) 32 | self.assertEqual(obs.dtype, self.dtype) 33 | np.testing.assert_allclose(env.original_observation, obs, rtol=rtol) 34 | 35 | obs, r, done, info = env.step(env.action_space.sample()) 36 | 37 | self.assertEqual(env.original_observation.dtype, np.float64) 38 | self.assertEqual(obs.dtype, self.dtype) 39 | np.testing.assert_allclose(env.original_observation, obs, rtol=rtol) 40 | 41 | 42 | @testing.parameterize(*testing.product({ 43 | 'env_id': ['CartPole-v1', 'Pendulum-v0'], 44 | })) 45 | class TestCastObservationToFloat32(unittest.TestCase): 46 | 47 | def test_cast_observation(self): 48 | env = chainerrl.wrappers.CastObservationToFloat32( 49 | gym.make(self.env_id)) 50 | 51 | obs = env.reset() 52 | self.assertEqual(env.original_observation.dtype, np.float64) 53 | self.assertEqual(obs.dtype, np.float32) 54 | np.testing.assert_allclose(env.original_observation, obs) 55 | 56 | obs, r, done, info = env.step(env.action_space.sample()) 57 | self.assertEqual(env.original_observation.dtype, np.float64) 58 | self.assertEqual(obs.dtype, np.float32) 59 | np.testing.assert_allclose(env.original_observation, obs) 60 | -------------------------------------------------------------------------------- /chainerrl/agents/double_pal.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | 12 | from chainerrl.agents import pal 13 | from chainerrl.recurrent import state_kept 14 | 15 | 16 | class DoublePAL(pal.PAL): 17 | 18 | def _compute_y_and_t(self, exp_batch): 19 | 20 | batch_state = exp_batch['state'] 21 | batch_size = len(exp_batch['reward']) 22 | 23 | qout = self.q_function(batch_state) 24 | 25 | batch_actions = exp_batch['action'] 26 | batch_q = qout.evaluate_actions(batch_actions) 27 | 28 | # Compute target values 29 | 30 | with chainer.no_backprop_mode(): 31 | target_qout = self.target_q_function(batch_state) 32 | 33 | batch_next_state = exp_batch['next_state'] 34 | 35 | with state_kept(self.q_function): 36 | next_qout = self.q_function(batch_next_state) 37 | 38 | with state_kept(self.target_q_function): 39 | target_next_qout = self.target_q_function( 40 | batch_next_state) 41 | next_q_max = F.reshape(target_next_qout.evaluate_actions( 42 | next_qout.greedy_actions), (batch_size,)) 43 | 44 | batch_rewards = exp_batch['reward'] 45 | batch_terminal = exp_batch['is_state_terminal'] 46 | 47 | # T Q: Bellman operator 48 | t_q = batch_rewards + exp_batch['discount'] * \ 49 | (1.0 - batch_terminal) * next_q_max 50 | 51 | # T_PAL Q: persistent advantage learning operator 52 | cur_advantage = F.reshape( 53 | 
target_qout.compute_advantage(batch_actions), (batch_size,)) 54 | next_advantage = F.reshape( 55 | target_next_qout.compute_advantage(batch_actions), 56 | (batch_size,)) 57 | tpal_q = t_q + self.alpha * \ 58 | F.maximum(cur_advantage, next_advantage) 59 | 60 | return batch_q, tpal_q 61 | -------------------------------------------------------------------------------- /chainerrl/functions/invert_gradients.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from future import standard_library 5 | standard_library.install_aliases() # NOQA 6 | 7 | from chainer import cuda 8 | from chainer import function 9 | from chainer.utils import type_check 10 | 11 | 12 | class InvertGradients(function.Function): 13 | """Inverts gradients of values exceeding a given range. 14 | 15 | See: http://arxiv.org/abs/1511.04143 16 | """ 17 | 18 | def __init__(self, range_min, range_max): 19 | self.range_min = range_min 20 | self.range_max = range_max 21 | self.range_width = self.range_max - self.range_min 22 | assert (self.range_width > 0).all() 23 | 24 | def check_type_forward(self, in_types): 25 | type_check.expect(in_types.size() == 1,) 26 | 27 | @property 28 | def label(self): 29 | return 'InvertGradients' 30 | 31 | def forward(self, inputs): 32 | return inputs 33 | 34 | def backward(self, inputs, grad_outputs): 35 | x, = inputs 36 | gy, = grad_outputs 37 | # In chainer, update will be like x.array -= lr * x.grad, 38 | # which means negative gradients will increase values. 39 | increasing = (gy < 0).astype(gy.dtype) 40 | gx = gy.copy() 41 | gx *= ((self.range_max - x) / self.range_width * increasing + 42 | (x - self.range_min) / self.range_width * (1 - increasing)) 43 | return gx, 44 | 45 | 46 | def invert_gradients(x, range_min, range_max): 47 | """Inverts gradients of values exceeding a given range. 48 | 49 | See: http://arxiv.org/abs/1511.04143 50 | 51 | Args: 52 | x (chainer.Variable or ndarray): Input value. 53 | range_min (chainer.Variable or ndarray): Minimum of the value range. 54 | range_max (chainer.Variable or ndarray): Maximum of the value range. 55 | Returns: 56 | The same value as x, except that the backpropagated gradients are 57 | scaled and inverted so that the values stay within the given range after an update.
58 | """ 59 | xp = cuda.get_array_module(x, x.array) 60 | return InvertGradients(xp.asarray(range_min), xp.asarray(range_max))(x) 61 | -------------------------------------------------------------------------------- /tests/functions_tests/test_weighted_sum_arrays.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import numpy 10 | 11 | import chainer 12 | from chainer import cuda 13 | from chainer import gradient_check 14 | from chainer import testing 15 | from chainer.testing import attr 16 | 17 | import chainerrl 18 | 19 | 20 | @testing.parameterize( 21 | *testing.product({ 22 | 'batchsize': [1, 3], 23 | 'n': [1, 2, 7], 24 | 'shape': [(1,), (1, 1), (2,), (2, 3)], 25 | }) 26 | ) 27 | class TestSumArrays(unittest.TestCase): 28 | 29 | def setUp(self): 30 | self.batch_size = 5 31 | array_shape = (self.batchsize,) + self.shape 32 | self.xs = [numpy.random.uniform( 33 | -1, 1, array_shape).astype(numpy.float32) 34 | for _ in range(self.n)] 35 | self.weights = [numpy.random.rand() for _ in range(self.n)] 36 | self.gy = numpy.random.uniform( 37 | -1, 1, array_shape).astype(numpy.float32) 38 | 39 | def check_forward(self, xs): 40 | y = chainerrl.functions.weighted_sum_arrays(xs, weights=self.weights) 41 | correct_y = sum(x * w for x, w in zip(self.xs, self.weights)) 42 | gradient_check.assert_allclose(correct_y, cuda.to_cpu(y.array)) 43 | 44 | def test_forward_cpu(self): 45 | self.check_forward(self.xs) 46 | 47 | @attr.gpu 48 | def test_forward_gpu(self): 49 | xs_gpu = [chainer.cuda.to_gpu(x) for x in self.xs] 50 | self.check_forward(xs_gpu) 51 | 52 | def check_backward(self, x_data, y_grad): 53 | gradient_check.check_backward( 54 | chainerrl.functions.WeightedSumArrays(self.weights), 55 | x_data, y_grad, eps=1e-2, rtol=1e-2) 56 | 57 | def test_backward_cpu(self): 58 | self.check_backward(self.xs, self.gy) 59 | 60 | @attr.gpu 61 | def test_backward_gpu(self): 62 | xs_gpu = [chainer.cuda.to_gpu(x) for x in self.xs] 63 | self.check_backward(xs_gpu, cuda.to_gpu(self.gy)) 64 | 65 | 66 | testing.run_module(__name__, __file__) 67 | -------------------------------------------------------------------------------- /chainerrl/links/dqn_head.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | import chainer 9 | from chainer import functions as F 10 | from chainer import links as L 11 | 12 | 13 | class NatureDQNHead(chainer.ChainList): 14 | """DQN's head (Nature version)""" 15 | 16 | def __init__(self, n_input_channels=4, n_output_channels=512, 17 | activation=F.relu, bias=0.1): 18 | self.n_input_channels = n_input_channels 19 | self.activation = activation 20 | self.n_output_channels = n_output_channels 21 | 22 | layers = [ 23 | L.Convolution2D(n_input_channels, 32, 8, stride=4, 24 | initial_bias=bias), 25 | L.Convolution2D(32, 64, 4, stride=2, initial_bias=bias), 26 | L.Convolution2D(64, 64, 3, stride=1, initial_bias=bias), 27 | L.Linear(3136, n_output_channels, initial_bias=bias), 
28 | ] 29 | 30 | super(NatureDQNHead, self).__init__(*layers) 31 | 32 | def __call__(self, state): 33 | h = state 34 | for layer in self: 35 | h = self.activation(layer(h)) 36 | return h 37 | 38 | 39 | class NIPSDQNHead(chainer.ChainList): 40 | """DQN's head (NIPS workshop version)""" 41 | 42 | def __init__(self, n_input_channels=4, n_output_channels=256, 43 | activation=F.relu, bias=0.1): 44 | self.n_input_channels = n_input_channels 45 | self.activation = activation 46 | self.n_output_channels = n_output_channels 47 | 48 | layers = [ 49 | L.Convolution2D(n_input_channels, 16, 8, stride=4, 50 | initial_bias=bias), 51 | L.Convolution2D(16, 32, 4, stride=2, initial_bias=bias), 52 | L.Linear(2592, n_output_channels, initial_bias=bias), 53 | ] 54 | 55 | super(NIPSDQNHead, self).__init__(*layers) 56 | 57 | def __call__(self, state): 58 | h = state 59 | for layer in self: 60 | h = self.activation(layer(h)) 61 | return h 62 | -------------------------------------------------------------------------------- /chainerrl/experiments/hooks.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | from future.utils import with_metaclass 13 | import numpy as np 14 | 15 | 16 | class StepHook(with_metaclass(ABCMeta, object)): 17 | """Hook function that will be called in training. 18 | 19 | This class is for clarifying the interface required for Hook functions. 20 | You don't need to inherit this class to define your own hooks. Any callable 21 | that accepts (env, agent, step) as arguments can be used as a hook. 22 | """ 23 | 24 | @abstractmethod 25 | def __call__(self, env, agent, step): 26 | """Call the hook. 27 | 28 | Args: 29 | env: Environment. 30 | agent: Agent. 31 | step: Current timestep. 32 | """ 33 | raise NotImplementedError 34 | 35 | 36 | class LinearInterpolationHook(StepHook): 37 | """Hook that will set a linearly interpolated value. 38 | 39 | You can use this hook to decay the learning rate by using a setter function 40 | as follows: 41 | 42 | .. code-block:: python 43 | 44 | def lr_setter(env, agent, value): 45 | agent.optimizer.lr = value 46 | 47 | hook = LinearInterpolationHook(10 ** 6, 1e-3, 0, lr_setter) 48 | 49 | 50 | Args: 51 | total_steps (int): Number of total steps. 52 | start_value (float): Start value. 53 | stop_value (float): Stop value. 
54 | setter (callable): (env, agent, value) -> None 55 | """ 56 | 57 | def __init__(self, total_steps, start_value, stop_value, setter): 58 | self.total_steps = total_steps 59 | self.start_value = start_value 60 | self.stop_value = stop_value 61 | self.setter = setter 62 | 63 | def __call__(self, env, agent, step): 64 | value = np.interp(step, 65 | [1, self.total_steps], 66 | [self.start_value, self.stop_value]) 67 | self.setter(env, agent, value) 68 | -------------------------------------------------------------------------------- /chainerrl/agents/al.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | 12 | from chainerrl.agents import dqn 13 | from chainerrl.recurrent import state_kept 14 | 15 | 16 | class AL(dqn.DQN): 17 | """Advantage Learning. 18 | 19 | See: http://arxiv.org/abs/1512.04860. 20 | 21 | Args: 22 | alpha (float): Weight of (persistent) advantages. Convergence 23 | is guaranteed only for alpha in [0, 1). 24 | 25 | For other arguments, see DQN. 26 | """ 27 | 28 | def __init__(self, *args, **kwargs): 29 | self.alpha = kwargs.pop('alpha', 0.9) 30 | super().__init__(*args, **kwargs) 31 | 32 | def _compute_y_and_t(self, exp_batch): 33 | 34 | batch_state = exp_batch['state'] 35 | batch_size = len(exp_batch['reward']) 36 | 37 | qout = self.q_function(batch_state) 38 | 39 | batch_actions = exp_batch['action'] 40 | 41 | batch_q = qout.evaluate_actions(batch_actions) 42 | 43 | # Compute target values 44 | 45 | with chainer.no_backprop_mode(): 46 | target_qout = self.target_q_function(batch_state) 47 | 48 | batch_next_state = exp_batch['next_state'] 49 | 50 | with state_kept(self.target_q_function): 51 | target_next_qout = self.target_q_function( 52 | batch_next_state) 53 | next_q_max = F.reshape(target_next_qout.max, (batch_size,)) 54 | 55 | batch_rewards = exp_batch['reward'] 56 | batch_terminal = exp_batch['is_state_terminal'] 57 | 58 | # T Q: Bellman operator 59 | t_q = batch_rewards + exp_batch['discount'] * \ 60 | (1.0 - batch_terminal) * next_q_max 61 | 62 | # T_AL Q: advantage learning operator 63 | cur_advantage = F.reshape( 64 | target_qout.compute_advantage(batch_actions), (batch_size,)) 65 | tal_q = t_q + self.alpha * cur_advantage 66 | 67 | return batch_q, tal_q 68 | 69 | def input_initial_batch_to_target_model(self, batch): 70 | pass 71 | -------------------------------------------------------------------------------- /tests/misc_tests/test_copy_param.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import chainer 10 | from chainer import links as L 11 | import numpy as np 12 | 13 | from chainerrl.misc import copy_param 14 | 15 | 16 | class TestCopyParam(unittest.TestCase): 17 | 18 | def test_copy_param(self): 19 | a = L.Linear(1, 5) 20 | b = L.Linear(1, 5) 21 | 22 | s = chainer.Variable(np.random.rand(1, 1).astype(np.float32)) 23 | a_out = list(a(s).array.ravel()) 24 | b_out = 
list(b(s).array.ravel()) 25 | self.assertNotEqual(a_out, b_out) 26 | 27 | # Copy b's parameters to a 28 | copy_param.copy_param(a, b) 29 | 30 | a_out_new = list(a(s).array.ravel()) 31 | b_out_new = list(b(s).array.ravel()) 32 | self.assertEqual(a_out_new, b_out) 33 | self.assertEqual(b_out_new, b_out) 34 | 35 | def test_copy_param_type_check(self): 36 | a = L.Linear(None, 5) 37 | b = L.Linear(1, 5) 38 | 39 | with self.assertRaises(TypeError): 40 | # Copy b's parameters to a, but since `a` parameter is not 41 | # initialized, it should raise error. 42 | copy_param.copy_param(a, b) 43 | 44 | def test_soft_copy_param(self): 45 | a = L.Linear(1, 5) 46 | b = L.Linear(1, 5) 47 | 48 | a.W.array[:] = 0.5 49 | b.W.array[:] = 1 50 | 51 | # a = (1 - tau) * a + tau * b 52 | copy_param.soft_copy_param(target_link=a, source_link=b, tau=0.1) 53 | 54 | np.testing.assert_almost_equal(a.W.array, np.full(a.W.shape, 0.55)) 55 | np.testing.assert_almost_equal(b.W.array, np.full(b.W.shape, 1.0)) 56 | 57 | copy_param.soft_copy_param(target_link=a, source_link=b, tau=0.1) 58 | 59 | np.testing.assert_almost_equal(a.W.array, np.full(a.W.shape, 0.595)) 60 | np.testing.assert_almost_equal(b.W.array, np.full(b.W.shape, 1.0)) 61 | 62 | def test_soft_copy_param_type_check(self): 63 | a = L.Linear(None, 5) 64 | b = L.Linear(1, 5) 65 | 66 | with self.assertRaises(TypeError): 67 | copy_param.soft_copy_param(target_link=a, source_link=b, tau=0.1) 68 | -------------------------------------------------------------------------------- /chainerrl/misc/draw_computational_graph.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import subprocess 10 | 11 | import chainer.computational_graph 12 | import chainerrl 13 | 14 | 15 | def collect_variables(obj): 16 | """Collect Variable objects inside a given object. 17 | 18 | Args: 19 | obj (object): Object to collect Variable objects from. 20 | Returns: 21 | List of Variable objects. 22 | """ 23 | variables = [] 24 | if isinstance(obj, chainer.Variable): 25 | return [obj] 26 | elif isinstance(obj, chainerrl.action_value.ActionValue): 27 | return list(obj.params) 28 | elif isinstance(obj, chainerrl.distribution.Distribution): 29 | return list(obj.params) 30 | elif isinstance(obj, (list, tuple)): 31 | variables = [] 32 | for child in obj: 33 | variables.extend(collect_variables(child)) 34 | return variables 35 | 36 | 37 | def is_graphviz_available(): 38 | return chainerrl.misc.is_return_code_zero(['dot', '-V']) 39 | 40 | 41 | def draw_computational_graph(outputs, filepath): 42 | """Draw a computational graph and write to a given file. 43 | 44 | Args: 45 | outputs (object): Output(s) of the computational graph. It must be 46 | a Variable, an ActionValue, a Distribution or a list of them. 47 | filepath (str): Filepath to write a graph without file extention. 48 | A DOT file will be saved with ".gv" extension added. 49 | If Graphviz's dot command is available, a PNG file will also be 50 | saved with ".png" extension added. 
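        Example (a minimal sketch; ``model`` and ``obs`` are placeholders
        for any link and input batch you already have):
            draw_computational_graph(model(obs), '/tmp/model_graph')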
51 | """ 52 | variables = collect_variables(outputs) 53 | g = chainer.computational_graph.build_computational_graph(variables) 54 | gv_filepath = filepath + '.gv' 55 | with open(gv_filepath, 'w') as f: 56 | # future.builtins.str is required to make sure the content is unicode 57 | # in both py2 and py3 58 | f.write(str(g.dump())) 59 | if is_graphviz_available(): 60 | png_filepath = filepath + '.png' 61 | subprocess.check_call( 62 | ['dot', '-Tpng', gv_filepath, '-o', png_filepath]) 63 | -------------------------------------------------------------------------------- /chainerrl/explorers/additive_ou.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from logging import getLogger 10 | 11 | import numpy as np 12 | 13 | from chainerrl import explorer 14 | 15 | 16 | class AdditiveOU(explorer.Explorer): 17 | """Additive Ornstein-Uhlenbeck process. 18 | 19 | Used in https://arxiv.org/abs/1509.02971 for exploration. 20 | 21 | Args: 22 | mu (float): Mean of the OU process 23 | theta (float): Friction to pull towards the mean 24 | sigma (float or ndarray): Scale of noise 25 | start_with_mu (bool): Start the process without noise 26 | """ 27 | 28 | def __init__(self, mu=0.0, theta=0.15, sigma=0.3, start_with_mu=False, 29 | logger=getLogger(__name__)): 30 | self.mu = mu 31 | self.theta = theta 32 | self.sigma = sigma 33 | self.start_with_mu = start_with_mu 34 | self.logger = logger 35 | self.ou_state = None 36 | 37 | def evolve(self): 38 | # dx = theta (mu - x) + sigma dW 39 | # for a Wiener process W 40 | noise = np.random.normal(size=self.ou_state.shape, loc=0, 41 | scale=self.sigma) 42 | self.ou_state += self.theta * (self.mu - self.ou_state) + noise 43 | 44 | def select_action(self, t, greedy_action_func, action_value=None): 45 | a = greedy_action_func() 46 | if self.ou_state is None: 47 | if self.start_with_mu: 48 | self.ou_state = np.full(a.shape, self.mu, dtype=np.float32) 49 | else: 50 | sigma_stable = (self.sigma / 51 | np.sqrt(2 * self.theta - self.theta ** 2)) 52 | self.ou_state = np.random.normal( 53 | size=a.shape, 54 | loc=self.mu, scale=sigma_stable).astype(np.float32) 55 | else: 56 | self.evolve() 57 | noise = self.ou_state 58 | self.logger.debug('t:%s noise:%s', t, noise) 59 | return a + noise 60 | 61 | def __repr__(self): 62 | return 'AdditiveOU(mu={}, theta={}, sigma={})'.format( 63 | self.mu, self.theta, self.sigma) 64 | -------------------------------------------------------------------------------- /chainerrl/misc/env_modifiers.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from builtins import * # NOQA 7 | standard_library.install_aliases() # NOQA 8 | 9 | import numpy as np 10 | 11 | 12 | def make_rendered(env, *render_args, **render_kwargs): 13 | base_step = env.step 14 | base_close = env.close 15 | 16 | def step(action): 17 | ret = base_step(action) 18 | env.render(*render_args, **render_kwargs) 19 | return ret 20 | 21 | def close(): 22 | env.render(*render_args, close=True, **render_kwargs) 23 | base_close() 24 | 
25 | env.step = step 26 | env.close = close 27 | 28 | 29 | def make_timestep_limited(env, timestep_limit): 30 | t = [1] 31 | old_step = env.step 32 | old_reset = env.reset 33 | 34 | def step(action): 35 | observation, reward, done, info = old_step(action) 36 | if t[0] >= timestep_limit: 37 | done = True 38 | t[0] += 1 39 | return observation, reward, done, info 40 | 41 | def reset(): 42 | t[0] = 1 43 | return old_reset() 44 | 45 | env.step = step 46 | env.reset = reset 47 | 48 | 49 | def make_action_filtered(env, action_filter): 50 | old_step = env.step 51 | 52 | def step(action): 53 | return old_step(action_filter(action)) 54 | 55 | env.step = step 56 | 57 | 58 | def make_reward_filtered(env, reward_filter): 59 | old_step = env.step 60 | 61 | def step(action): 62 | observation, reward, done, info = old_step(action) 63 | reward = reward_filter(reward) 64 | return observation, reward, done, info 65 | 66 | env.step = step 67 | 68 | 69 | def make_reward_clipped(env, low, high): 70 | make_reward_filtered(env, lambda x: np.clip(x, low, high)) 71 | 72 | 73 | def make_action_repeated(env, n_times): 74 | """Repeat received actions. 75 | 76 | - Rewards are accumulated while repeating. 77 | - Only latest observations are returned. 78 | """ 79 | old_step = env.step 80 | 81 | def step(action): 82 | r_total = 0 83 | for _ in range(n_times): 84 | obs, r, done, info = old_step(action) 85 | r_total += r 86 | if done: 87 | break 88 | return obs, r_total, done, info 89 | 90 | env.step = step 91 | -------------------------------------------------------------------------------- /tests/wrappers_tests/test_render.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | from chainer import testing 12 | import mock 13 | 14 | import chainerrl 15 | 16 | 17 | @testing.parameterize(*testing.product({ 18 | 'render_kwargs': [ 19 | {}, 20 | {'mode': 'human'}, 21 | {'mode': 'rgb_array'}, 22 | ] 23 | })) 24 | class TestRender(unittest.TestCase): 25 | 26 | def test(self): 27 | orig_env = mock.Mock() 28 | # Reaches the terminal state after five actions 29 | orig_env.reset.side_effect = [ 30 | ('state', 0), 31 | ('state', 3), 32 | ] 33 | orig_env.step.side_effect = [ 34 | (('state', 1), 0, False, {}), 35 | (('state', 2), 1, True, {}), 36 | ] 37 | env = chainerrl.wrappers.Render(orig_env, **self.render_kwargs) 38 | 39 | # Not called env.render yet 40 | self.assertEqual(orig_env.render.call_count, 0) 41 | 42 | obs = env.reset() 43 | self.assertEqual(obs, ('state', 0)) 44 | 45 | # Called once 46 | self.assertEqual(orig_env.render.call_count, 1) 47 | 48 | obs, reward, done, info = env.step(0) 49 | self.assertEqual(obs, ('state', 1)) 50 | self.assertEqual(reward, 0) 51 | self.assertEqual(done, False) 52 | self.assertEqual(info, {}) 53 | 54 | # Called twice 55 | self.assertEqual(orig_env.render.call_count, 2) 56 | 57 | obs, reward, done, info = env.step(0) 58 | self.assertEqual(obs, ('state', 2)) 59 | self.assertEqual(reward, 1) 60 | self.assertEqual(done, True) 61 | self.assertEqual(info, {}) 62 | 63 | # Called thrice 64 | self.assertEqual(orig_env.render.call_count, 3) 65 | 66 | obs = env.reset() 67 | self.assertEqual(obs, ('state', 3)) 68 | 69 | # Called four times 70 | 
self.assertEqual(orig_env.render.call_count, 4) 71 | 72 | # All the calls should receive correct kwargs 73 | for call in orig_env.render.call_args_list: 74 | args, kwargs = call 75 | self.assertEqual(len(args), 0) 76 | self.assertEqual(kwargs, self.render_kwargs) 77 | -------------------------------------------------------------------------------- /tests/agents_tests/test_dpp.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainer import testing 10 | 11 | import basetest_dqn_like as base 12 | from basetest_training import _TestBatchTrainingMixin 13 | from chainerrl.agents.dpp import DPP 14 | from chainerrl.agents.dpp import DPPGreedy 15 | from chainerrl.agents.dpp import DPPL 16 | 17 | 18 | def parse_dpp_agent(dpp_type): 19 | return {'DPP': DPP, 20 | 'DPPL': DPPL, 21 | 'DPPGreedy': DPPGreedy}[dpp_type] 22 | 23 | 24 | @testing.parameterize( 25 | *testing.product({ 26 | 'dpp_type': ['DPP', 'DPPL', 'DPPGreedy'], 27 | }) 28 | ) 29 | class TestDPPOnDiscreteABC( 30 | _TestBatchTrainingMixin, 31 | base._TestDQNOnDiscreteABC): 32 | 33 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 34 | agent_class = parse_dpp_agent(self.dpp_type) 35 | return agent_class( 36 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 37 | replay_start_size=100, target_update_interval=100) 38 | 39 | 40 | # DPP and DPPL don't support continuous action spaces 41 | @testing.parameterize( 42 | *testing.product({ 43 | 'dpp_type': ['DPPGreedy'], 44 | }) 45 | ) 46 | class TestDPPOnContinuousABC( 47 | _TestBatchTrainingMixin, 48 | base._TestDQNOnContinuousABC): 49 | 50 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 51 | agent_class = parse_dpp_agent(self.dpp_type) 52 | return agent_class( 53 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 54 | replay_start_size=100, target_update_interval=100) 55 | 56 | 57 | # Currently DPP doesn't work with recurrent models 58 | # TODO(fujita) make it work 59 | 60 | # @testing.parameterize( 61 | # *testing.product({ 62 | # 'dpp_type': ['DPP', 'DPPL', 'DPPGreedy'], 63 | # }), 64 | # ) 65 | # class TestDPPOnDiscretePOABC(base._TestDQNOnDiscretePOABC): 66 | # 67 | # def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 68 | # agent_class = parse_dpp_agent(self.dpp_type) 69 | # return agent_class( 70 | # q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 71 | # replay_start_size=100, target_update_interval=100, 72 | # episodic_update=True) 73 | -------------------------------------------------------------------------------- /chainerrl/agents/pal.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | 12 | from chainerrl.agents import dqn 13 | from chainerrl.recurrent import state_kept 14 | 15 | 16 | class PAL(dqn.DQN): 17 | """Persistent Advantage Learning. 18 | 19 | See: http://arxiv.org/abs/1512.04860. 
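    The target used here is T Q(s, a) + alpha * max(A(s, a), A(s', a)),
    where T is the Bellman operator and A(s, a) = Q(s, a) - max_b Q(s, b)
    is the advantage under the target network.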
20 | 21 | Args: 22 | alpha (float): Weight of (persistent) advantages. Convergence 23 | is guaranteed only for alpha in [0, 1). 24 | 25 | For other arguments, see DQN. 26 | """ 27 | 28 | def __init__(self, *args, **kwargs): 29 | self.alpha = kwargs.pop('alpha', 0.9) 30 | super().__init__(*args, **kwargs) 31 | 32 | def _compute_y_and_t(self, exp_batch): 33 | 34 | batch_state = exp_batch['state'] 35 | batch_size = len(exp_batch['reward']) 36 | 37 | qout = self.q_function(batch_state) 38 | 39 | batch_actions = exp_batch['action'] 40 | batch_q = qout.evaluate_actions(batch_actions) 41 | 42 | # Compute target values 43 | with chainer.no_backprop_mode(): 44 | 45 | target_qout = self.target_q_function(batch_state) 46 | 47 | batch_next_state = exp_batch['next_state'] 48 | 49 | with state_kept(self.target_q_function): 50 | target_next_qout = self.target_q_function( 51 | batch_next_state) 52 | next_q_max = F.reshape(target_next_qout.max, (batch_size,)) 53 | 54 | batch_rewards = exp_batch['reward'] 55 | batch_terminal = exp_batch['is_state_terminal'] 56 | 57 | # T Q: Bellman operator 58 | t_q = batch_rewards + exp_batch['discount'] * \ 59 | (1.0 - batch_terminal) * next_q_max 60 | 61 | # T_PAL Q: persistent advantage learning operator 62 | cur_advantage = F.reshape( 63 | target_qout.compute_advantage(batch_actions), (batch_size,)) 64 | next_advantage = F.reshape( 65 | target_next_qout.compute_advantage(batch_actions), 66 | (batch_size,)) 67 | tpal_q = t_q + self.alpha * \ 68 | F.maximum(cur_advantage, next_advantage) 69 | 70 | return batch_q, tpal_q 71 | 72 | def input_initial_batch_to_target_model(self, batch): 73 | pass 74 | -------------------------------------------------------------------------------- /chainerrl/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl import action_value # NOQA 2 | from chainerrl import agent # NOQA 3 | from chainerrl import agents # NOQA 4 | from chainerrl import distribution # NOQA 5 | from chainerrl import env # NOQA 6 | from chainerrl import envs # NOQA 7 | from chainerrl import experiments # NOQA 8 | from chainerrl import explorer # NOQA 9 | from chainerrl import explorers # NOQA 10 | from chainerrl import functions # NOQA 11 | from chainerrl import links # NOQA 12 | from chainerrl import misc # NOQA 13 | from chainerrl import optimizers # NOQA 14 | from chainerrl import policies # NOQA 15 | from chainerrl import policy # NOQA 16 | from chainerrl import q_function # NOQA 17 | from chainerrl import q_functions # NOQA 18 | from chainerrl import recurrent # NOQA 19 | from chainerrl import replay_buffer # NOQA 20 | from chainerrl import v_function # NOQA 21 | from chainerrl import v_functions # NOQA 22 | from chainerrl import wrappers # NOQA 23 | 24 | # For backward compatibility while avoiding circular import 25 | policy.SoftmaxPolicy = policies.SoftmaxPolicy 26 | policy.FCSoftmaxPolicy = policies.FCSoftmaxPolicy 27 | policy.ContinuousDeterministicPolicy = policies.ContinuousDeterministicPolicy 28 | policy.FCDeterministicPolicy = policies.FCDeterministicPolicy 29 | policy.FCBNDeterministicPolicy = policies.FCBNDeterministicPolicy 30 | policy.FCLSTMDeterministicPolicy = policies.FCLSTMDeterministicPolicy 31 | policy.FCGaussianPolicy = policies.FCGaussianPolicy 32 | policy.MellowmaxPolicy = policies.MellowmaxPolicy 33 | 34 | q_function.DuelingDQN = q_functions.DuelingDQN 35 | q_function.SingleModelStateActionQFunction = \ 36 | q_functions.SingleModelStateActionQFunction 37 | q_function.FCSAQFunction = 
q_functions.FCSAQFunction 38 | q_function.FCLSTMSAQFunction = q_functions.FCLSTMSAQFunction 39 | q_function.FCBNSAQFunction = q_functions.FCBNSAQFunction 40 | q_function.FCBNLateActionSAQFunction = q_functions.FCBNLateActionSAQFunction 41 | q_function.FCLateActionSAQFunction = q_functions.FCLateActionSAQFunction 42 | q_function.SingleModelStateActionQFunction = \ 43 | q_functions.SingleModelStateActionQFunction 44 | q_function.FCStateQFunctionWithDiscreteAction = \ 45 | q_functions.FCStateQFunctionWithDiscreteAction 46 | q_function.FCLSTMStateQFunction = q_functions.FCLSTMStateQFunction 47 | q_function.FCQuadraticStateQFunction = q_functions.FCQuadraticStateQFunction 48 | q_function.FCBNQuadraticStateQFunction = \ 49 | q_functions.FCBNQuadraticStateQFunction 50 | 51 | v_function.SingleModelVFunction = v_functions.SingleModelVFunction 52 | v_function.FCVFunction = v_functions.FCVFunction 53 | -------------------------------------------------------------------------------- /chainerrl/links/noisy_linear.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | from chainer.initializers import LeCunUniform 4 | import chainer.links as L 5 | import numpy 6 | 7 | from chainerrl.initializers import VarianceScalingConstant 8 | 9 | 10 | class FactorizedNoisyLinear(chainer.Chain): 11 | """Linear layer in Factorized Noisy Network 12 | 13 | Args: 14 | mu_link (L.Linear): Linear link that computes mean of output. 15 | sigma_scale (float): The hyperparameter sigma_0 in the original paper. 16 | Scaling factor of the initial weights of noise-scaling parameters. 17 | """ 18 | 19 | def __init__(self, mu_link, sigma_scale=0.4): 20 | super(FactorizedNoisyLinear, self).__init__() 21 | self.out_size = mu_link.out_size 22 | self.nobias = not ('/b' in [name for name, _ in mu_link.namedparams()]) 23 | 24 | W_data = mu_link.W.array 25 | in_size = None if W_data is None else W_data.shape[1] 26 | device_id = mu_link._device_id 27 | 28 | with self.init_scope(): 29 | self.mu = L.Linear(in_size, self.out_size, self.nobias, 30 | initialW=LeCunUniform(1 / numpy.sqrt(3))) 31 | 32 | self.sigma = L.Linear(in_size, self.out_size, self.nobias, 33 | initialW=VarianceScalingConstant( 34 | sigma_scale), 35 | initial_bias=VarianceScalingConstant( 36 | sigma_scale)) 37 | 38 | if device_id is not None: 39 | self.to_gpu(device_id) 40 | 41 | def _eps(self, shape, dtype): 42 | xp = self.xp 43 | r = xp.random.standard_normal(shape).astype(dtype) 44 | 45 | # apply the function f 46 | return xp.copysign(xp.sqrt(xp.abs(r)), r) 47 | 48 | def __call__(self, x): 49 | if self.mu.W.array is None: 50 | self.mu.W.initialize((self.out_size, numpy.prod(x.shape[1:]))) 51 | if self.sigma.W.array is None: 52 | self.sigma.W.initialize((self.out_size, numpy.prod(x.shape[1:]))) 53 | 54 | # use info of sigma.W to avoid strange error messages 55 | dtype = self.sigma.W.dtype 56 | out_size, in_size = self.sigma.W.shape 57 | 58 | eps_x = self._eps(in_size, dtype) 59 | eps_y = self._eps(out_size, dtype) 60 | W = self.mu.W + self.sigma.W * self.xp.outer(eps_y, eps_x) 61 | if self.nobias: 62 | return F.linear(x, W) 63 | else: 64 | b = self.mu.b + self.sigma.b * eps_y 65 | return F.linear(x, W, b) 66 | -------------------------------------------------------------------------------- /chainerrl/functions/mellowmax.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 
| from __future__ import division 4 | from __future__ import absolute_import 5 | 6 | import chainer 7 | from chainer import functions as F 8 | import numpy as np 9 | import scipy.optimize 10 | 11 | 12 | def mellowmax(values, omega=1., axis=1): 13 | """Mellowmax function. 14 | 15 | This is a kind of softmax function that is, unlike the Boltzmann softmax, 16 | non-expansion. 17 | 18 | See: http://arxiv.org/abs/1612.05628 19 | 20 | Args: 21 | values (Variable or ndarray): 22 | Input values. Mellowmax is taken along the second axis. 23 | omega (float): 24 | Parameter of mellowmax. 25 | axis (int): 26 | Axis along which mellowmax is taken. 27 | Returns: 28 | outputs (Variable) 29 | """ 30 | n = values.shape[axis] 31 | return (F.logsumexp(omega * values, axis=axis) - np.log(n)) / omega 32 | 33 | 34 | def maximum_entropy_mellowmax(values, omega=1., beta_min=-10, beta_max=10): 35 | """Maximum entropy mellowmax policy function. 36 | 37 | This function provides a categorical distribution whose expectation matches 38 | the one of mellowmax function while maximizing its entropy. 39 | 40 | See: http://arxiv.org/abs/1612.05628 41 | 42 | Args: 43 | values (Variable or ndarray): 44 | Input values. Mellowmax is taken along the second axis. 45 | omega (float): 46 | Parameter of mellowmax. 47 | beta_min (float): 48 | Minimum value of beta, used in Brent's algorithm. 49 | beta_max (float): 50 | Maximum value of beta, used in Brent's algorithm. 51 | Returns: 52 | outputs (Variable) 53 | """ 54 | xp = chainer.cuda.get_array_module(values) 55 | mm = mellowmax(values, axis=1) 56 | 57 | # Advantage: Q - mellowmax(Q) 58 | batch_adv = values - F.broadcast_to(F.expand_dims(mm, 1), values.shape) 59 | # Move data to CPU because we use Brent's algorithm in scipy 60 | batch_adv = chainer.cuda.to_cpu(batch_adv.array) 61 | batch_beta = np.empty(mm.shape, dtype=np.float32) 62 | 63 | # Beta is computed as the root of this function 64 | def f(y, adv): 65 | return np.sum(np.exp(y * adv) * adv) 66 | 67 | for idx in np.ndindex(mm.shape): 68 | idx_full = idx[:1] + (slice(None),) + idx[1:] 69 | adv = batch_adv[idx_full] 70 | try: 71 | beta = scipy.optimize.brentq( 72 | f, a=beta_min, b=beta_max, args=(adv,)) 73 | except ValueError: 74 | beta = 0 75 | batch_beta[idx] = beta 76 | 77 | return F.softmax(xp.expand_dims(xp.asarray(batch_beta), 1) * values) 78 | -------------------------------------------------------------------------------- /tests/functions_tests/test_lower_triangular_matrix.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import numpy 10 | 11 | import chainer 12 | from chainer import cuda 13 | from chainer import gradient_check 14 | from chainer import testing 15 | from chainer.testing import attr 16 | 17 | from chainerrl.functions.lower_triangular_matrix import lower_triangular_matrix 18 | from chainerrl.functions.lower_triangular_matrix import LowerTriangularMatrix 19 | 20 | 21 | @testing.parameterize( 22 | {'n': 1}, 23 | {'n': 2}, 24 | {'n': 3}, 25 | {'n': 4}, 26 | {'n': 5}, 27 | ) 28 | class TestLowerTriangularMatrix(unittest.TestCase): 29 | 30 | def setUp(self): 31 | self.batch_size = 5 32 | self.diag = numpy.random.uniform( 33 | 0.1, 1, (self.batch_size, self.n)).astype(numpy.float32) 34 | non_diag_size 
= self.n * (self.n - 1) // 2 35 | self.non_diag = numpy.random.uniform( 36 | -1, 1, (self.batch_size, non_diag_size)).astype(numpy.float32) 37 | self.gy = numpy.random.uniform( 38 | -1, 1, (self.batch_size, self.n, self.n)).astype(numpy.float32) 39 | 40 | def check_forward(self, diag_data, non_diag_data): 41 | diag = chainer.Variable(diag_data) 42 | non_diag = chainer.Variable(non_diag_data) 43 | y = lower_triangular_matrix(diag, non_diag) 44 | 45 | correct_y = numpy.zeros( 46 | (self.batch_size, self.n, self.n), dtype=numpy.float32) 47 | 48 | tril_rows, tril_cols = numpy.tril_indices(self.n, -1) 49 | correct_y[:, tril_rows, tril_cols] = cuda.to_cpu(non_diag_data) 50 | 51 | diag_rows, diag_cols = numpy.diag_indices(self.n) 52 | correct_y[:, diag_rows, diag_cols] = cuda.to_cpu(diag_data) 53 | 54 | gradient_check.assert_allclose(correct_y, cuda.to_cpu(y.array)) 55 | 56 | def test_forward_cpu(self): 57 | self.check_forward(self.diag, self.non_diag) 58 | 59 | @attr.gpu 60 | def test_forward_gpu(self): 61 | self.check_forward(cuda.to_gpu(self.diag), cuda.to_gpu(self.non_diag)) 62 | 63 | def check_backward(self, x_data, y_grad): 64 | gradient_check.check_backward( 65 | LowerTriangularMatrix(), 66 | x_data, y_grad, eps=1e-2, rtol=1e-2) 67 | 68 | def test_backward_cpu(self): 69 | self.check_backward((self.diag, self.non_diag), self.gy) 70 | 71 | @attr.gpu 72 | def test_backward_gpu(self): 73 | self.check_backward((cuda.to_gpu(self.diag), cuda.to_gpu( 74 | self.non_diag)), cuda.to_gpu(self.gy)) 75 | 76 | 77 | testing.run_module(__name__, __file__) 78 | -------------------------------------------------------------------------------- /chainerrl/optimizers/rmsprop_async.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainer import cuda 10 | from chainer import optimizer 11 | import numpy 12 | 13 | 14 | _default_hyperparam = optimizer.Hyperparameter() 15 | _default_hyperparam.lr = 0.01 16 | _default_hyperparam.alpha = 0.99 17 | _default_hyperparam.eps = 1e-8 18 | 19 | 20 | class RMSpropAsyncRule(optimizer.UpdateRule): 21 | 22 | def __init__(self, parent_hyperparam=None, lr=None, alpha=None, eps=None): 23 | super(RMSpropAsyncRule, self).__init__( 24 | parent_hyperparam or _default_hyperparam) 25 | if lr is not None: 26 | self.hyperparam.lr = lr 27 | if alpha is not None: 28 | self.hyperparam.alpha = alpha 29 | if eps is not None: 30 | self.hyperparam.eps = eps 31 | 32 | def init_state(self, param): 33 | xp = cuda.get_array_module(param.array) 34 | with cuda.get_device_from_array(param.array): 35 | self.state['ms'] = xp.zeros_like(param.array) 36 | 37 | def update_core_cpu(self, param): 38 | grad = param.grad 39 | if grad is None: 40 | return 41 | hp = self.hyperparam 42 | ms = self.state['ms'] 43 | 44 | ms *= hp.alpha 45 | ms += (1 - hp.alpha) * grad * grad 46 | param.array -= hp.lr * grad / numpy.sqrt(ms + hp.eps) 47 | 48 | def update_core_gpu(self, param): 49 | grad = param.grad 50 | if grad is None: 51 | return 52 | cuda.elementwise( 53 | 'T grad, T lr, T alpha, T eps', 54 | 'T param, T ms', 55 | '''ms = alpha * ms + (1 - alpha) * grad * grad; 56 | param -= lr * grad / sqrt(ms + eps);''', 57 | 'rmsprop')(grad, self.hyperparam.lr, self.hyperparam.alpha, 58 | 
self.hyperparam.eps, param.array, self.state['ms']) 59 | 60 | 61 | class RMSpropAsync(optimizer.GradientMethod): 62 | 63 | """RMSprop for asynchronous methods. 64 | 65 | The only difference from chainer.optimizers.RMSprop in that the epsilon is 66 | outside the square root. 67 | """ 68 | 69 | def __init__(self, lr=_default_hyperparam.lr, 70 | alpha=_default_hyperparam.alpha, eps=_default_hyperparam.eps): 71 | super(RMSpropAsync, self).__init__() 72 | self.hyperparam.lr = lr 73 | self.hyperparam.alpha = alpha 74 | self.hyperparam.eps = eps 75 | 76 | lr = optimizer.HyperparameterProxy('lr') 77 | alpha = optimizer.HyperparameterProxy('alpha') 78 | eps = optimizer.HyperparameterProxy('eps') 79 | 80 | def create_update_rule(self): 81 | return RMSpropAsyncRule(self.hyperparam) 82 | -------------------------------------------------------------------------------- /tests/functions_tests/test_invert_gradients.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | 6 | import unittest 7 | 8 | import chainer 9 | from chainer import cuda 10 | from chainer import functions 11 | from chainer import testing 12 | from chainer.testing import attr 13 | from chainer.testing import condition 14 | import numpy 15 | 16 | from chainerrl.functions.invert_gradients import invert_gradients 17 | 18 | 19 | @testing.parameterize(*testing.product({ 20 | 'shape': [(), (1, 1), (2, 3), (2, 3, 4), (2, 3, 4, 5)], 21 | 'dtype': [numpy.float32], 22 | })) 23 | class TestInvertGradients(unittest.TestCase): 24 | 25 | def setUp(self): 26 | self.x = numpy.random.uniform(-1, 1, self.shape).astype(self.dtype) 27 | 28 | def check_forward(self, x_data): 29 | 30 | # In chainer, update will be like x.array -= lr * x.grad, 31 | # which means negative gradients will increase values. 
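        # Three regimes are exercised below: the range straddling x,
        # x above range_max, and x below range_min.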
32 | 33 | # Not exceeding 34 | range_max = x_data + 0.1 35 | range_min = x_data - 0.1 36 | x = chainer.Variable(x_data) 37 | y = invert_gradients(x, range_min=range_min, range_max=range_max) 38 | 39 | loss = functions.sum(y) # Minimize y 40 | loss.backward() 41 | self.assertTrue((x.grad > 0).all()) # Decrease x 42 | x.cleargrad() 43 | 44 | loss = -functions.sum(y) # Maximize y 45 | loss.backward() 46 | self.assertTrue((x.grad < 0).all()) # Increase x 47 | x.cleargrad() 48 | 49 | # Exceeding range_max 50 | range_max = x_data - 0.1 51 | range_min = x_data - 0.2 52 | y = invert_gradients(x, range_min=range_min, range_max=range_max) 53 | 54 | loss = functions.sum(y) # Minimize y 55 | loss.backward() 56 | self.assertTrue((x.grad > 0).all()) # Decrease x 57 | x.cleargrad() 58 | 59 | loss = -functions.sum(y) # Maximize y 60 | loss.backward() 61 | self.assertTrue((x.grad > 0).all()) # Decrease x 62 | x.cleargrad() 63 | 64 | # Exceeding range_min 65 | range_max = x_data + 0.2 66 | range_min = x_data + 0.1 67 | y = invert_gradients(x, range_min=range_min, range_max=range_max) 68 | 69 | loss = functions.sum(y) # Minimize y 70 | loss.backward() 71 | self.assertTrue((x.grad < 0).all()) # Increase x 72 | x.cleargrad() 73 | 74 | loss = -functions.sum(y) # Maximize y 75 | loss.backward() 76 | self.assertTrue((x.grad < 0).all()) # Increase x 77 | x.cleargrad() 78 | 79 | @condition.retry(3) 80 | def test_forward_cpu(self): 81 | self.check_forward(self.x) 82 | 83 | @attr.gpu 84 | @condition.retry(3) 85 | def test_forward_gpu(self): 86 | self.check_forward(cuda.to_gpu(self.x)) 87 | 88 | 89 | testing.run_module(__name__, __file__) 90 | -------------------------------------------------------------------------------- /tests/misc_tests/test_collections.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import collections 10 | import unittest 11 | 12 | from chainer import testing 13 | 14 | from chainerrl.misc.collections import RandomAccessQueue 15 | 16 | 17 | @testing.parameterize(*( 18 | testing.product({ 19 | 'maxlen': [1, 10, None], 20 | 'init_seq': [None, [], range(5)], 21 | }) 22 | )) 23 | class TestRandomAccessQueue(unittest.TestCase): 24 | def setUp(self): 25 | if self.init_seq: 26 | self.y_queue = RandomAccessQueue(self.init_seq, maxlen=self.maxlen) 27 | self.t_queue = collections.deque(self.init_seq, maxlen=self.maxlen) 28 | else: 29 | self.y_queue = RandomAccessQueue(maxlen=self.maxlen) 30 | self.t_queue = collections.deque(maxlen=self.maxlen) 31 | 32 | def test1(self): 33 | self.check_all() 34 | 35 | self.check_popleft() 36 | self.do_append(10) 37 | self.check_all() 38 | 39 | self.check_popleft() 40 | self.check_popleft() 41 | self.do_append(11) 42 | self.check_all() 43 | 44 | # test negative indices 45 | n = len(self.t_queue) 46 | for i in range(-n, 0): 47 | self.check_getitem(i) 48 | 49 | for k in range(4): 50 | self.do_extend(range(k)) 51 | self.check_all() 52 | 53 | for k in range(4): 54 | self.check_popleft() 55 | self.do_extend(range(k)) 56 | self.check_all() 57 | 58 | for k in range(10): 59 | self.do_append(20 + k) 60 | self.check_popleft() 61 | self.check_popleft() 62 | self.check_all() 63 | 64 | for _ in range(100): 65 | self.check_popleft() 66 | 67 | def check_all(self): 
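        # The queue under test must agree with the reference deque on
        # length and on the element at every index.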
68 | self.check_len() 69 | n = len(self.t_queue) 70 | for i in range(n): 71 | self.check_getitem(i) 72 | 73 | def check_len(self): 74 | self.assertEqual(len(self.y_queue), len(self.t_queue)) 75 | 76 | def check_getitem(self, i): 77 | self.assertEqual(self.y_queue[i], self.t_queue[i]) 78 | 79 | def do_setitem(self, i, x): 80 | self.y_queue[i] = x 81 | self.t_queue[i] = x 82 | 83 | def do_append(self, x): 84 | self.y_queue.append(x) 85 | self.t_queue.append(x) 86 | 87 | def do_extend(self, xs): 88 | self.y_queue.extend(xs) 89 | self.t_queue.extend(xs) 90 | 91 | def check_popleft(self): 92 | try: 93 | t = self.t_queue.popleft() 94 | except IndexError: 95 | with self.assertRaises(IndexError): 96 | self.y_queue.popleft() 97 | else: 98 | self.assertEqual(self.y_queue.popleft(), t) 99 | -------------------------------------------------------------------------------- /tests/wrappers_tests/test_randomize_action.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | from chainer import testing 12 | from chainer.testing import condition 13 | import gym 14 | import gym.spaces 15 | 16 | import chainerrl 17 | 18 | 19 | class ActionRecordingEnv(gym.Env): 20 | 21 | observation_space = gym.spaces.Box(low=-1, high=1, shape=(1,)) 22 | action_space = gym.spaces.Discrete(3) 23 | 24 | def __init__(self): 25 | self.past_actions = [] 26 | 27 | def reset(self): 28 | return self.observation_space.sample() 29 | 30 | def step(self, action): 31 | self.past_actions.append(action) 32 | return self.observation_space.sample(), 0, False, {} 33 | 34 | 35 | @testing.parameterize(*testing.product({ 36 | 'random_fraction': [0, 0.3, 0.6, 1], 37 | })) 38 | class TestRandomizeAction(unittest.TestCase): 39 | 40 | @condition.retry(3) 41 | def test_action_ratio(self): 42 | random_fraction = self.random_fraction 43 | env = ActionRecordingEnv() 44 | env = chainerrl.wrappers.RandomizeAction( 45 | env, random_fraction=random_fraction) 46 | env.reset() 47 | n = 1000 48 | delta = 0.05 49 | for _ in range(n): 50 | # Always send action 0 51 | env.step(0) 52 | # Ratio of selected actions should be: 53 | # 0: (1 - random_fraction) + random_fraction/3 54 | # 1: random_fraction/3 55 | # 2: random_fraction/3 56 | self.assertAlmostEqual( 57 | env.env.past_actions.count(0) / n, 58 | (1 - random_fraction) + random_fraction / 3, delta=delta) 59 | self.assertAlmostEqual( 60 | env.env.past_actions.count(1) / n, 61 | random_fraction / 3, delta=delta) 62 | self.assertAlmostEqual( 63 | env.env.past_actions.count(2) / n, 64 | random_fraction / 3, delta=delta) 65 | 66 | @condition.retry(3) 67 | def test_seed(self): 68 | 69 | def get_actions(seed): 70 | random_fraction = self.random_fraction 71 | env = ActionRecordingEnv() 72 | env = chainerrl.wrappers.RandomizeAction( 73 | env, random_fraction=random_fraction) 74 | env.seed(seed) 75 | for _ in range(1000): 76 | # Always send action 0 77 | env.step(0) 78 | return env.env.past_actions 79 | 80 | a_seed0 = get_actions(0) 81 | a_seed1 = get_actions(1) 82 | b_seed0 = get_actions(0) 83 | b_seed1 = get_actions(1) 84 | 85 | self.assertEqual(a_seed0, b_seed0) 86 | self.assertEqual(a_seed1, b_seed1) 87 | if self.random_fraction > 0: 88 | self.assertNotEqual(a_seed0, a_seed1) 
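# With random_fraction == 0 the wrapper always forwards the agent's action, so runs with different seeds produce identical histories and cannot be expected to differ.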
89 | -------------------------------------------------------------------------------- /chainerrl/misc/copy_param.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainer import links as L 10 | 11 | 12 | def copy_param(target_link, source_link): 13 | """Copy parameters of a link to another link.""" 14 | target_params = dict(target_link.namedparams()) 15 | for param_name, param in source_link.namedparams(): 16 | if target_params[param_name].array is None: 17 | raise TypeError( 18 | 'target_link parameter {} is None. Maybe the model params are ' 19 | 'not initialized.\nPlease try to forward dummy input ' 20 | 'beforehand to determine parameter shape of the model.'.format( 21 | param_name)) 22 | target_params[param_name].array[:] = param.array 23 | 24 | # Copy Batch Normalization's statistics 25 | target_links = dict(target_link.namedlinks()) 26 | for link_name, link in source_link.namedlinks(): 27 | if isinstance(link, L.BatchNormalization): 28 | target_bn = target_links[link_name] 29 | target_bn.avg_mean[:] = link.avg_mean 30 | target_bn.avg_var[:] = link.avg_var 31 | 32 | 33 | def soft_copy_param(target_link, source_link, tau): 34 | """Soft-copy parameters of a link to another link.""" 35 | target_params = dict(target_link.namedparams()) 36 | for param_name, param in source_link.namedparams(): 37 | if target_params[param_name].array is None: 38 | raise TypeError( 39 | 'target_link parameter {} is None. Maybe the model params are ' 40 | 'not initialized.\nPlease try to forward dummy input ' 41 | 'beforehand to determine parameter shape of the model.'.format( 42 | param_name)) 43 | target_params[param_name].array[:] *= (1 - tau) 44 | target_params[param_name].array[:] += tau * param.array 45 | 46 | # Soft-copy Batch Normalization's statistics 47 | target_links = dict(target_link.namedlinks()) 48 | for link_name, link in source_link.namedlinks(): 49 | if isinstance(link, L.BatchNormalization): 50 | target_bn = target_links[link_name] 51 | target_bn.avg_mean[:] *= (1 - tau) 52 | target_bn.avg_mean[:] += tau * link.avg_mean 53 | target_bn.avg_var[:] *= (1 - tau) 54 | target_bn.avg_var[:] += tau * link.avg_var 55 | 56 | 57 | def copy_grad(target_link, source_link): 58 | """Copy gradients of a link to another link.""" 59 | target_params = dict(target_link.namedparams()) 60 | for param_name, param in source_link.namedparams(): 61 | target_params[param_name].grad[:] = param.grad 62 | 63 | 64 | def synchronize_parameters(src, dst, method, tau=None): 65 | {'hard': lambda: copy_param(dst, src), 66 | 'soft': lambda: soft_copy_param(dst, src, tau), 67 | }[method]() 68 | -------------------------------------------------------------------------------- /tests/links_tests/test_noisy_linear.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import chainer 4 | from chainer import cuda 5 | from chainer import testing 6 | from chainer.testing import attr 7 | from chainer.testing import condition 8 | import numpy 9 | 10 | from chainerrl.links import noisy_linear 11 | 12 | 13 | @testing.parameterize(*testing.product({ 14 | 'size_args': [ 15 | (5,), # uninitialized from Chainer v2 16 | (None, 5), # uninitialized 17 | (6, 5), # 
initialized 18 | ], 19 | 'nobias': [False, True], 20 | })) 21 | class TestFactorizedNoisyLinear(unittest.TestCase): 22 | def setUp(self): 23 | mu = chainer.links.Linear(*self.size_args, nobias=self.nobias) 24 | self.linear = noisy_linear.FactorizedNoisyLinear(mu) 25 | 26 | def _test_calls(self, xp): 27 | x_data = xp.arange(12).astype(numpy.float32).reshape((2, 6)) 28 | x = chainer.Variable(x_data) 29 | self.linear(x) 30 | self.linear(x_data + 1) 31 | self.linear(x_data.reshape((2, 3, 2))) 32 | 33 | def test_calls_cpu(self): 34 | self._test_calls(numpy) 35 | 36 | @attr.gpu 37 | def test_calls_gpu(self): 38 | self.linear.to_gpu(0) 39 | self._test_calls(cuda.cupy) 40 | 41 | @attr.gpu 42 | def test_calls_gpu_after_to_gpu(self): 43 | mu = self.linear.mu 44 | mu.to_gpu(0) 45 | self.linear = noisy_linear.FactorizedNoisyLinear(mu) 46 | self._test_calls(cuda.cupy) 47 | 48 | def _test_randomness(self, xp): 49 | x = xp.random.standard_normal((10, 6)).astype(numpy.float32) 50 | y1 = self.linear(x).array 51 | y2 = self.linear(x).array 52 | d = float(xp.mean(xp.square(y1 - y2))) 53 | 54 | # The parameter name suggests that 55 | # xp.sqrt(d / 2) is approximately sigma_scale = 0.4 56 | # In fact, for each element [i, j], it holds: 57 | # \E[(y2 - y1) ** 2] = 2 * \Var(y) = (4 / pi) * sigma_scale ** 2 58 | 59 | target = (0.4 ** 2) * 2 60 | if self.nobias: 61 | target *= 2 / numpy.pi 62 | else: 63 | target *= 2 / numpy.pi + numpy.sqrt(2 / numpy.pi) / y1.shape[1] 64 | 65 | self.assertGreater(d, target / 3.) 66 | self.assertLess(d, target * 3.) 67 | 68 | @condition.retry(3) 69 | def test_randomness_cpu(self): 70 | self._test_randomness(numpy) 71 | 72 | @attr.gpu 73 | @condition.retry(3) 74 | def test_randomness_gpu(self): 75 | self.linear.to_gpu(0) 76 | self._test_randomness(cuda.cupy) 77 | 78 | def _test_non_randomness(self, xp): 79 | # Noises should be the same in a batch 80 | x0 = xp.random.standard_normal((1, 6)).astype(numpy.float32) 81 | x = xp.broadcast_to(x0, (2, 6)) 82 | y = self.linear(x).array 83 | xp.testing.assert_allclose(y[0], y[1], rtol=1e-4) 84 | 85 | def test_non_randomness_cpu(self): 86 | self._test_non_randomness(numpy) 87 | 88 | @attr.gpu 89 | def test_non_randomness_gpu(self): 90 | self.linear.to_gpu(0) 91 | self._test_non_randomness(cuda.cupy) 92 | -------------------------------------------------------------------------------- /examples/atari/dqn/README.md: -------------------------------------------------------------------------------- 1 | # DQN 2 | This example trains a DQN agent as described in the following paper: [Human-level control through Deep Reinforcement Learning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf). 3 | 4 | ## Requirements 5 | 6 | - atari_py>=0.1.1 7 | - opencv-python 8 | 9 | ## Running the Example 10 | 11 | ``` 12 | python train_dqn.py [options] 13 | ``` 14 | 15 | ### Useful Options 16 | - `--gpu`. Specifies the GPU. If you do not have a GPU on your machine, run the example with the option `--gpu -1`. E.g. `python train_dqn.py --gpu -1`. 17 | - `--env`. Specifies the environment. 18 | - `--render`. Add this option to render the states in a GUI window. 19 | - `--seed`. This option specifies the random seed used. 20 | - `--outdir`. This option specifies the output directory to which the results are written. 21 | 22 | To view the full list of options, either view the code or run the example with the `--help` option. 23 | 24 | ## Results 25 | These results reflect ChainerRL `v0.5.0`.
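The documented options above should be enough to regenerate entries of the table below; a typical invocation might look like this (the environment ID is illustrative and depends on your `atari_py`/Gym installation):

```
python train_dqn.py --env BreakoutNoFrameskip-v4 --gpu 0 --seed 0 --outdir results
```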
26 | 27 | | Game | Score | Reported Scores | 28 | | ------------- |:-------------:|:-------------:| 29 | | AirRaid | N/A| N/A| 30 | | Alien | N/A| **3069**| 31 | | Amidar | N/A| **739.5**| 32 | | Assault | N/A| **3359**| 33 | | Asterix | N/A| **6012**| 34 | | Asteroids | N/A| **1629**| 35 | | Atlantis | N/A| **85641**| 36 | | Bank Heist | N/A| **429.7**| 37 | | Battlezone | N/A| **26300**| 38 | | Beamrider | N/A| **6846**| 39 | | Berzerk | N/A| N/A| 40 | | Bowling | N/A| **42.4**| 41 | | Boxing | N/A| **71.8**| 42 | | Breakout | N/A| **401.2**| 43 | | Carnival | N/A| N/A| 44 | | Centipede | N/A| **8309**| 45 | | Chopper Command | N/A| **6687**| 46 | | Crazy Climber | N/A| **114103**| 47 | | Demon Attack | N/A| **9711**| 48 | | Double Dunk | N/A| **-18.1**| 49 | | Elevator Action | N/A| N/A| 50 | | Enduro | N/A| **301.8**| 51 | | Fishing Derby | N/A| **-0.8**| 52 | | Freeway | N/A| **30.3**| 53 | | Frostbite | N/A| **328.3**| 54 | | Gopher | N/A| **8520**| 55 | | Gravitar | N/A| **306.7**| 56 | | H.E.R.O. | N/A| **19950**| 57 | | Ice Hockey | N/A| **-1.6**| 58 | | James Bond 007 | N/A| **576.7**| 59 | | Journey Escape | N/A| N/A| 60 | | Kangaroo | N/A| **6740**| 61 | | Krull | N/A| **3805**| 62 | | Kung-Fu Master | N/A| **23270**| 63 | | Montezuma's Revenge | N/A| **0**| 64 | | Ms. Pac-Man | N/A| **2311**| 65 | | Name This Game | N/A| **7257**| 66 | | Phoenix | N/A| N/A| 67 | | Pitfall II | N/A| N/A| 68 | | Pitfall! | N/A| N/A| 69 | | Pong | N/A| **18.9**| 70 | | Pooyan | N/A| N/A| 71 | | Private Eye | N/A| **1788**| 72 | | Qbert | N/A| **10596**| 73 | | River Raid | N/A| **8316**| 74 | | Road Runner | N/A| **18257**| 75 | | Robot Tank | N/A| **51.6**| 76 | | Seaquest | N/A| **5286**| 77 | | Skiing | N/A| N/A| 78 | | Solaris | N/A| N/A| 79 | | Space Invaders | N/A| **1976**| 80 | | Stargunner | N/A| **57997**| 81 | | Tennis | N/A| **-2.5**| 82 | | Time Pilot | N/A| **5947**| 83 | | Tutankham | N/A| **186.7**| 84 | | Up’n Down | N/A| **8456**| 85 | | Venture | N/A| **380.0**| 86 | | Video Pinball | N/A| **42684**| 87 | | WizardOfWor | N/A| **3393**| 88 | | YarsRevenge | N/A| N/A| 89 | | Zaxxon | N/A| **4977**| 90 | 91 | -------------------------------------------------------------------------------- /tests/test_ale.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import random 10 | import sys 11 | import tempfile 12 | import unittest 13 | 14 | import numpy as np 15 | from PIL import Image 16 | 17 | from chainerrl.envs import ale 18 | 19 | 20 | class TestALE(unittest.TestCase): 21 | 22 | def setUp(self): 23 | pass 24 | 25 | def test_state(self): 26 | env = ale.ALE('breakout') 27 | self.assertEqual(len(env.state), 4) 28 | for s in env.state: 29 | self.assertEqual(s.shape, (84, 84)) 30 | self.assertEqual(s.dtype, np.uint8) 31 | 32 | def test_episode(self): 33 | env = ale.ALE('breakout') 34 | self.assertFalse(env.is_terminal) 35 | last_state = env.state 36 | while not env.is_terminal: 37 | 38 | # test state 39 | self.assertEqual(len(env.state), 4) 40 | for s in env.state: 41 | self.assertEqual(s.shape, (84, 84)) 42 | self.assertEqual(s.dtype, np.uint8) 43 | 44 | print('state (sum)', sum(env.state).sum()) 45 | 46 | legal_actions = env.legal_actions 47 | 
print('legal_actions:', legal_actions) 48 | self.assertGreater(len(legal_actions), 0) 49 | a = random.randrange(len(legal_actions)) 50 | print('a', a) 51 | env.receive_action(a) 52 | if not env.is_terminal: 53 | np.testing.assert_array_equal( 54 | np.asarray(last_state[1:]), np.asarray(env.state[:3])) 55 | last_state = env.state 56 | 57 | def test_current_screen(self): 58 | env = ale.ALE('breakout') 59 | tempdir = tempfile.mkdtemp() 60 | print('tempdir: {}'.format(tempdir), file=sys.stderr) 61 | for episode in range(6): 62 | env.initialize() 63 | t = 0 64 | while not env.is_terminal: 65 | for i in range(4): 66 | screen = env.state[i] 67 | self.assertEqual(screen.dtype, np.uint8) 68 | img = Image.fromarray(screen, mode='L') 69 | filename = '{}/{}_{}_{}.bmp'.format( 70 | tempdir, str(episode).zfill(6), str(t).zfill(6), i) 71 | img.save(filename) 72 | legal_actions = env.legal_actions 73 | a = random.randrange(len(legal_actions)) 74 | env.receive_action(a) 75 | t += 1 76 | 77 | def test_reward(self): 78 | env = ale.ALE('pong') 79 | for episode in range(3): 80 | total_r = 0 81 | while not env.is_terminal: 82 | a = random.randrange(len(env.legal_actions)) 83 | env.receive_action(a) 84 | total_r += env.reward 85 | self.assertGreater(total_r, -22) 86 | self.assertLess(total_r, -15) 87 | env.initialize() 88 | 89 | def test_seed(self): 90 | ale.ALE('breakout', seed=0) 91 | ale.ALE('breakout', seed=2 ** 31 - 1) 92 | -------------------------------------------------------------------------------- /tests/links_tests/test_empirical_normalization.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import chainer 4 | from chainer import testing 5 | import numpy as np 6 | 7 | from chainerrl.links import empirical_normalization 8 | 9 | 10 | class TestEmpiricalNormalization(unittest.TestCase): 11 | def test_small_cpu(self): 12 | self._test_small(gpu=-1) 13 | 14 | @testing.attr.gpu 15 | def test_small_gpu(self): 16 | self._test_small(gpu=0) 17 | 18 | def _test_small(self, gpu): 19 | en = empirical_normalization.EmpiricalNormalization(10) 20 | if gpu >= 0: 21 | chainer.cuda.get_device_from_id(gpu).use() 22 | en.to_gpu() 23 | 24 | xp = en.xp 25 | 26 | xs = [] 27 | for t in range(10): 28 | x = xp.random.normal(loc=4, scale=2, size=(t + 3, 10)) 29 | en(x) 30 | xs.extend(list(x)) 31 | xs = xp.stack(xs) 32 | true_mean = xp.mean(xs, axis=0) 33 | true_std = xp.std(xs, axis=0) 34 | xp.testing.assert_allclose(en.mean, true_mean, rtol=1e-4) 35 | xp.testing.assert_allclose(en.std, true_std, rtol=1e-4) 36 | 37 | @testing.attr.slow 38 | def test_large(self): 39 | en = empirical_normalization.EmpiricalNormalization(10) 40 | for _ in range(10000): 41 | x = np.random.normal(loc=4, scale=2, size=(7, 10)) 42 | en(x) 43 | x = 2 * np.random.normal(loc=4, scale=2, size=(1, 10)) 44 | enx = en(x, update=False) 45 | 46 | np.testing.assert_allclose(en.mean, 4, rtol=1e-1) 47 | np.testing.assert_allclose(en.std, 2, rtol=1e-1) 48 | 49 | # Compare with the ground-truth normalization 50 | np.testing.assert_allclose((x - 4) / 2, enx, rtol=1e-1) 51 | 52 | # Test inverse 53 | np.testing.assert_allclose(x, en.inverse(enx), rtol=1e-4) 54 | 55 | def test_batch_axis(self): 56 | shape = (2, 3, 4) 57 | for batch_axis in range(3): 58 | en = empirical_normalization.EmpiricalNormalization( 59 | shape=shape[:batch_axis] + shape[batch_axis + 1:], 60 | batch_axis=batch_axis, 61 | ) 62 | for _ in range(10): 63 | x = np.random.rand(*shape) 64 | en(x) 65 | 66 | def test_until(self): 67 | en = 
empirical_normalization.EmpiricalNormalization(7, until=20) 68 | last_mean = None 69 | last_std = None 70 | for t in range(15): 71 | en(np.random.rand(2, 7) + t) 72 | 73 | if 1 <= t < 10: 74 | self.assertFalse(np.allclose(en.mean, last_mean, rtol=1e-4)) 75 | self.assertFalse(np.allclose(en.std, last_std, rtol=1e-4)) 76 | elif t >= 10: 77 | np.testing.assert_allclose(en.mean, last_mean, rtol=1e-4) 78 | np.testing.assert_allclose(en.std, last_std, rtol=1e-4) 79 | 80 | last_mean = en.mean 81 | last_std = en.std 82 | 83 | def test_mixed_inputs(self): 84 | en = empirical_normalization.EmpiricalNormalization(7) 85 | for t in range(5): 86 | y = en(np.random.rand(t + 1, 7)) 87 | self.assertIsInstance(y, np.ndarray) 88 | y = en(chainer.Variable(np.random.rand(t + 1, 7))) 89 | self.assertIsInstance(y, chainer.Variable) 90 | -------------------------------------------------------------------------------- /tests/experiments_tests/test_train_agent.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | import tempfile 9 | import unittest 10 | 11 | import mock 12 | 13 | import chainerrl 14 | 15 | 16 | class TestTrainAgent(unittest.TestCase): 17 | 18 | def test(self): 19 | 20 | outdir = tempfile.mkdtemp() 21 | 22 | agent = mock.Mock() 23 | env = mock.Mock() 24 | # Reaches the terminal state after five actions 25 | env.reset.side_effect = [('state', 0)] 26 | env.step.side_effect = [ 27 | (('state', 1), 0, False, {}), 28 | (('state', 2), 0, False, {}), 29 | (('state', 3), -0.5, False, {}), 30 | (('state', 4), 0, False, {}), 31 | (('state', 5), 1, True, {}), 32 | ] 33 | hook = mock.Mock() 34 | 35 | chainerrl.experiments.train_agent( 36 | agent=agent, 37 | env=env, 38 | steps=5, 39 | outdir=outdir, 40 | step_hooks=[hook]) 41 | 42 | self.assertEqual(agent.act_and_train.call_count, 5) 43 | self.assertEqual(agent.stop_episode_and_train.call_count, 1) 44 | 45 | self.assertEqual(env.reset.call_count, 1) 46 | self.assertEqual(env.step.call_count, 5) 47 | 48 | self.assertEqual(hook.call_count, 5) 49 | # A hook receives (env, agent, step) 50 | for i, call in enumerate(hook.call_args_list): 51 | args, kwargs = call 52 | self.assertEqual(args[0], env) 53 | self.assertEqual(args[1], agent) 54 | # step starts with 1 55 | self.assertEqual(args[2], i + 1) 56 | 57 | def test_needs_reset(self): 58 | 59 | outdir = tempfile.mkdtemp() 60 | 61 | agent = mock.Mock() 62 | env = mock.Mock() 63 | # First episode: 0 -> 1 -> 2 -> 3 (reset) 64 | # Second episode: 4 -> 5 -> 6 -> 7 (done) 65 | env.reset.side_effect = [('state', 0), ('state', 4)] 66 | env.step.side_effect = [ 67 | (('state', 1), 0, False, {}), 68 | (('state', 2), 0, False, {}), 69 | (('state', 3), 0, False, {'needs_reset': True}), 70 | (('state', 5), -0.5, False, {}), 71 | (('state', 6), 0, False, {}), 72 | (('state', 7), 1, True, {}), 73 | ] 74 | hook = mock.Mock() 75 | 76 | chainerrl.experiments.train_agent( 77 | agent=agent, 78 | env=env, 79 | steps=5, 80 | outdir=outdir, 81 | step_hooks=[hook]) 82 | 83 | self.assertEqual(agent.act_and_train.call_count, 5) 84 | self.assertEqual(agent.stop_episode_and_train.call_count, 2) 85 | 86 | self.assertEqual(env.reset.call_count, 2) 87 | self.assertEqual(env.step.call_count, 5) 88 | 89 | 
self.assertEqual(hook.call_count, 5) 90 | # A hook receives (env, agent, step) 91 | for i, call in enumerate(hook.call_args_list): 92 | args, kwargs = call 93 | self.assertEqual(args[0], env) 94 | self.assertEqual(args[1], agent) 95 | # step starts with 1 96 | self.assertEqual(args[2], i + 1) 97 | -------------------------------------------------------------------------------- /chainerrl/explorers/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from logging import getLogger 10 | 11 | import numpy as np 12 | 13 | from chainerrl import explorer 14 | 15 | 16 | def select_action_epsilon_greedily(epsilon, random_action_func, 17 | greedy_action_func): 18 | if np.random.rand() < epsilon: 19 | return random_action_func(), False 20 | else: 21 | return greedy_action_func(), True 22 | 23 | 24 | class ConstantEpsilonGreedy(explorer.Explorer): 25 | """Epsilon-greedy with constant epsilon. 26 | 27 | Args: 28 | epsilon: epsilon used 29 | random_action_func: function with no argument that returns action 30 | logger: logger used 31 | """ 32 | 33 | def __init__(self, epsilon, random_action_func, 34 | logger=getLogger(__name__)): 35 | assert epsilon >= 0 and epsilon <= 1 36 | self.epsilon = epsilon 37 | self.random_action_func = random_action_func 38 | self.logger = logger 39 | 40 | def select_action(self, t, greedy_action_func, action_value=None): 41 | a, greedy = select_action_epsilon_greedily( 42 | self.epsilon, self.random_action_func, greedy_action_func) 43 | greedy_str = 'greedy' if greedy else 'non-greedy' 44 | self.logger.debug('t:%s a:%s %s', t, a, greedy_str) 45 | return a 46 | 47 | def __repr__(self): 48 | return 'ConstantEpsilonGreedy(epsilon={})'.format(self.epsilon) 49 | 50 | 51 | class LinearDecayEpsilonGreedy(explorer.Explorer): 52 | """Epsilon-greedy with linearly decayed epsilon. 53 | 54 | Args: 55 | start_epsilon: max value of epsilon 56 | end_epsilon: min value of epsilon 57 | decay_steps: how many steps it takes for epsilon to decay 58 | random_action_func: function with no argument that returns action 59 | logger: logger used 60 | """ 61 | 62 | def __init__(self, start_epsilon, end_epsilon, 63 | decay_steps, random_action_func, logger=getLogger(__name__)): 64 | assert start_epsilon >= 0 and start_epsilon <= 1 65 | assert end_epsilon >= 0 and end_epsilon <= 1 66 | assert decay_steps >= 0 67 | self.start_epsilon = start_epsilon 68 | self.end_epsilon = end_epsilon 69 | self.decay_steps = decay_steps 70 | self.random_action_func = random_action_func 71 | self.logger = logger 72 | self.epsilon = start_epsilon 73 | 74 | def compute_epsilon(self, t): 75 | if t >= self.decay_steps: # >= also covers decay_steps == 0 76 | return self.end_epsilon 77 | else: 78 | epsilon_diff = self.end_epsilon - self.start_epsilon 79 | return self.start_epsilon + epsilon_diff * (t / self.decay_steps) 80 | 81 | def select_action(self, t, greedy_action_func, action_value=None): 82 | self.epsilon = self.compute_epsilon(t) 83 | a, greedy = select_action_epsilon_greedily( 84 | self.epsilon, self.random_action_func, greedy_action_func) 85 | greedy_str = 'greedy' if greedy else 'non-greedy' 86 | self.logger.debug('t:%s a:%s %s', t, a, greedy_str) 87 | return a 88 | 89 | def __repr__(self): 90 | return
'LinearDecayEpsilonGreedy(epsilon={})'.format(self.epsilon) 91 | -------------------------------------------------------------------------------- /tests/envs_tests/test_vector_envs.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | from chainer import testing 12 | import gym 13 | import numpy as np 14 | 15 | import chainerrl 16 | 17 | 18 | @testing.parameterize(*testing.product({ 19 | 'num_envs': [1, 2, 3], 20 | 'env_id': ['CartPole-v0', 'Pendulum-v0'], 21 | 'random_seed_offset': [0, 100], 22 | 'vector_env_to_test': ['SerialVectorEnv', 'MultiprocessVectorEnv'], 23 | })) 24 | class TestVectorEnv(unittest.TestCase): 25 | 26 | def setUp(self): 27 | # Init VectorEnv to test 28 | if self.vector_env_to_test == 'SerialVectorEnv': 29 | self.vec_env = chainerrl.envs.SerialVectorEnv( 30 | [gym.make(self.env_id) for _ in range(self.num_envs)]) 31 | elif self.vector_env_to_test == 'MultiprocessVectorEnv': 32 | self.vec_env = chainerrl.envs.MultiprocessVectorEnv( 33 | [(lambda: gym.make(self.env_id)) 34 | for _ in range(self.num_envs)]) 35 | else: 36 | assert False 37 | # Init envs to compare against 38 | self.envs = [gym.make(self.env_id) for _ in range(self.num_envs)] 39 | 40 | def tearDown(self): 41 | # Delete so that all the subprocesses are joined 42 | del self.vec_env 43 | 44 | def test_num_envs(self): 45 | self.assertEqual(self.vec_env.num_envs, self.num_envs) 46 | 47 | def test_action_space(self): 48 | self.assertEqual(self.vec_env.action_space, self.envs[0].action_space) 49 | 50 | def test_observation_space(self): 51 | self.assertEqual( 52 | self.vec_env.observation_space, self.envs[0].observation_space) 53 | 54 | def test_seed_reset_and_step(self): 55 | # seed 56 | seeds = [self.random_seed_offset + i for i in range(self.num_envs)] 57 | self.vec_env.seed(seeds) 58 | for env, seed in zip(self.envs, seeds): 59 | env.seed(seed) 60 | 61 | # reset 62 | obss = self.vec_env.reset() 63 | real_obss = [env.reset() for env in self.envs] 64 | np.testing.assert_allclose(obss, real_obss) 65 | 66 | # step 67 | actions = [env.action_space.sample() for env in self.envs] 68 | real_obss, real_rewards, real_dones, real_infos = zip(*[ 69 | env.step(action) for env, action in zip(self.envs, actions)]) 70 | obss, rewards, dones, infos = self.vec_env.step(actions) 71 | np.testing.assert_allclose(obss, real_obss) 72 | self.assertEqual(rewards, real_rewards) 73 | self.assertEqual(dones, real_dones) 74 | self.assertEqual(infos, real_infos) 75 | 76 | # reset with full mask should have no effect 77 | mask = np.ones(self.num_envs) 78 | obss = self.vec_env.reset(mask) 79 | np.testing.assert_allclose(obss, real_obss) 80 | 81 | # reset with partial mask 82 | mask = np.zeros(self.num_envs) 83 | mask[-1] = 1 84 | obss = self.vec_env.reset(mask) 85 | real_obss = list(real_obss) 86 | for i in range(self.num_envs): 87 | if not mask[i]: 88 | real_obss[i] = self.envs[i].reset() 89 | np.testing.assert_allclose(obss, real_obss) 90 | 91 | 92 | testing.run_module(__name__, __file__) 93 | -------------------------------------------------------------------------------- /tests/test_agent.py: -------------------------------------------------------------------------------- 1 | from
__future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import os 10 | import tempfile 11 | import unittest 12 | 13 | import chainer 14 | import numpy as np 15 | 16 | import chainerrl 17 | 18 | 19 | def create_simple_link(): 20 | link = chainer.Link() 21 | with link.init_scope(): 22 | link.param = chainer.Parameter(np.zeros(1)) 23 | return link 24 | 25 | 26 | class Parent(chainerrl.agent.AttributeSavingMixin, object): 27 | 28 | saved_attributes = ['link', 'child'] 29 | 30 | def __init__(self): 31 | self.link = create_simple_link() 32 | self.child = Child() 33 | 34 | 35 | class Child(chainerrl.agent.AttributeSavingMixin, object): 36 | 37 | saved_attributes = ['link'] 38 | 39 | def __init__(self): 40 | self.link = create_simple_link() 41 | 42 | 43 | class Parent2(chainerrl.agent.AttributeSavingMixin, object): 44 | 45 | saved_attributes = ['child_a', 'child_b'] 46 | 47 | def __init__(self, child_a, child_b): 48 | self.child_a = child_a 49 | self.child_b = child_b 50 | 51 | 52 | class TestAttributeSavingMixin(unittest.TestCase): 53 | 54 | def test_save_load(self): 55 | parent = Parent() 56 | parent.link.param.array[:] = 1 57 | parent.child.link.param.array[:] = 2 58 | # Save 59 | dirname = tempfile.mkdtemp() 60 | parent.save(dirname) 61 | self.assertTrue(os.path.isdir(dirname)) 62 | self.assertTrue(os.path.isfile(os.path.join(dirname, 'link.npz'))) 63 | self.assertTrue(os.path.isdir(os.path.join(dirname, 'child'))) 64 | self.assertTrue(os.path.isfile( 65 | os.path.join(dirname, 'child', 'link.npz'))) 66 | # Load 67 | parent = Parent() 68 | self.assertEqual(int(parent.link.param.array), 0) 69 | self.assertEqual(int(parent.child.link.param.array), 0) 70 | parent.load(dirname) 71 | self.assertEqual(int(parent.link.param.array), 1) 72 | self.assertEqual(int(parent.child.link.param.array), 2) 73 | 74 | def test_save_load_2(self): 75 | parent = Parent() 76 | parent2 = Parent2(parent.child, parent) 77 | # Save 78 | dirname = tempfile.mkdtemp() 79 | parent2.save(dirname) 80 | # Load 81 | parent = Parent() 82 | parent2 = Parent2(parent.child, parent) 83 | parent2.load(dirname) 84 | 85 | def test_loop1(self): 86 | parent = Parent() 87 | parent.child = parent 88 | dirname = tempfile.mkdtemp() 89 | 90 | # The assertion in ChainerRL should fail on save(). 91 | # Otherwise it seems to raise OSError: [Errno 63] File name too long 92 | with self.assertRaises(AssertionError): 93 | parent.save(dirname) 94 | 95 | def test_loop2(self): 96 | parent1 = Parent() 97 | parent2 = Parent() 98 | parent1.child = parent2 99 | parent2.child = parent1 100 | dirname = tempfile.mkdtemp() 101 | 102 | # The assertion in ChainerRL should fail on save(). 
103 | # Otherwise it seems to raise OSError: [Errno 63] File name too long 104 | with self.assertRaises(AssertionError): 105 | parent1.save(dirname) 106 | -------------------------------------------------------------------------------- /tests/misc_tests/test_draw_computational_graph.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import os 9 | import tempfile 10 | import unittest 11 | 12 | import chainer 13 | from chainer import testing 14 | import numpy as np 15 | 16 | import chainerrl 17 | 18 | 19 | _v = chainer.Variable(np.zeros(5)) 20 | _dav = chainerrl.action_value.DiscreteActionValue( 21 | chainer.Variable(np.zeros((5, 5)))) 22 | _qav = chainerrl.action_value.QuadraticActionValue( 23 | chainer.Variable(np.zeros((5, 5), dtype=np.float32)), 24 | chainer.Variable(np.ones((5, 5, 5), dtype=np.float32)), 25 | chainer.Variable(np.zeros((5, 1), dtype=np.float32)), 26 | ) 27 | _sdis = chainerrl.distribution.SoftmaxDistribution( 28 | chainer.Variable(np.zeros((5, 5)))) 29 | _gdis = chainerrl.distribution.GaussianDistribution( 30 | chainer.Variable(np.zeros((5, 5), dtype=np.float32)), 31 | chainer.Variable(np.ones((5, 5), dtype=np.float32))) 32 | 33 | 34 | @testing.parameterize( 35 | {'obj': [], 'expected': []}, 36 | {'obj': (), 'expected': []}, 37 | {'obj': _v, 'expected': [_v]}, 38 | {'obj': _dav, 'expected': list(_dav.params)}, 39 | {'obj': _qav, 'expected': list(_qav.params)}, 40 | {'obj': _sdis, 'expected': list(_sdis.params)}, 41 | {'obj': _gdis, 'expected': list(_gdis.params)}, 42 | {'obj': [_v, _dav, _sdis], 43 | 'expected': [_v] + list(_dav.params) + list(_sdis.params)}, 44 | ) 45 | class TestCollectVariables(unittest.TestCase): 46 | 47 | def _assert_eq_var_list(self, a, b): 48 | # Equality between two Variable lists 49 | self.assertEqual(len(a), len(b)) 50 | self.assertTrue(isinstance(a, list)) 51 | self.assertTrue(isinstance(b, list)) 52 | for item in a: 53 | self.assertTrue(isinstance(item, chainer.Variable)) 54 | for item in b: 55 | self.assertTrue(isinstance(item, chainer.Variable)) 56 | for va, vb in zip(a, b): 57 | self.assertEqual(id(va), id(vb)) 58 | 59 | def test_collect_variables(self): 60 | vs = chainerrl.misc.collect_variables(self.obj) 61 | self._assert_eq_var_list(vs, self.expected) 62 | 63 | # Wrap by a list 64 | vs = chainerrl.misc.collect_variables([self.obj]) 65 | self._assert_eq_var_list(vs, self.expected) 66 | 67 | # Wrap by two lists 68 | vs = chainerrl.misc.collect_variables([[self.obj]]) 69 | self._assert_eq_var_list(vs, self.expected) 70 | 71 | # Wrap by a tuple 72 | vs = chainerrl.misc.collect_variables((self.obj,)) 73 | self._assert_eq_var_list(vs, self.expected) 74 | 75 | # Wrap by a two tuples 76 | vs = chainerrl.misc.collect_variables(((self.obj,),)) 77 | self._assert_eq_var_list(vs, self.expected) 78 | 79 | 80 | class TestDrawComputationalGraph(unittest.TestCase): 81 | 82 | def test_draw_computational_graph(self): 83 | x = chainer.Variable(np.zeros(5)) 84 | y = x ** 2 + chainer.Variable(np.ones(5)) 85 | dirname = tempfile.mkdtemp() 86 | filepath = os.path.join(dirname, 'graph') 87 | chainerrl.misc.draw_computational_graph(y, filepath) 88 | self.assertTrue(os.path.exists(filepath + '.gv')) 89 | if chainerrl.misc.is_graphviz_available(): 90 | 
self.assertTrue(os.path.exists(filepath + '.png')) 91 | else: 92 | self.assertFalse(os.path.exists(filepath + '.png')) 93 | -------------------------------------------------------------------------------- /chainerrl/links/mlp_bn.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | from chainer import links as L 12 | 13 | from chainerrl.initializers import LeCunNormal 14 | 15 | 16 | class LinearBN(chainer.Chain): 17 | """Linear layer with BatchNormalization.""" 18 | 19 | def __init__(self, in_size, out_size): 20 | super().__init__() 21 | with self.init_scope(): 22 | self.linear = L.Linear(in_size, out_size) 23 | bn = L.BatchNormalization(out_size) 24 | bn.avg_var[:] = 1 25 | self.bn = bn 26 | 27 | def __call__(self, x): 28 | return self.bn(self.linear(x)) 29 | 30 | 31 | class MLPBN(chainer.Chain): 32 | """Multi-Layer Perceptron with Batch Normalization. 33 | 34 | Args: 35 | in_size (int): Input size. 36 | out_size (int): Output size. 37 | hidden_sizes (list of ints): Sizes of hidden channels. 38 | normalize_input (bool): If set to True, Batch Normalization is applied 39 | to inputs. 40 | normalize_output (bool): If set to True, Batch Normalization is applied 41 | to outputs. 42 | nonlinearity (callable): Nonlinearity between layers. It must accept a 43 | Variable as an argument and return a Variable with the same shape. 44 | Nonlinearities with learnable parameters such as PReLU are not 45 | supported. 46 | last_wscale (float): Scale of weight initialization of the last layer. 
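Example (illustrative; the sizes are arbitrary): mlp = MLPBN(in_size=10, out_size=3, hidden_sizes=[64, 64]); y = mlp(x), where x is a float32 batch of shape (batch_size, 10) with batch_size > 1 in train mode.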
47 | """ 48 | 49 | def __init__(self, in_size, out_size, hidden_sizes, normalize_input=True, 50 | normalize_output=False, nonlinearity=F.relu, last_wscale=1): 51 | self.in_size = in_size 52 | self.out_size = out_size 53 | self.hidden_sizes = hidden_sizes 54 | self.normalize_input = normalize_input 55 | self.normalize_output = normalize_output 56 | self.nonlinearity = nonlinearity 57 | 58 | super().__init__() 59 | with self.init_scope(): 60 | if normalize_input: 61 | self.input_bn = L.BatchNormalization(in_size) 62 | self.input_bn.avg_var[:] = 1 63 | 64 | if hidden_sizes: 65 | hidden_layers = [] 66 | hidden_layers.append(LinearBN(in_size, hidden_sizes[0])) 67 | for hin, hout in zip(hidden_sizes, hidden_sizes[1:]): 68 | hidden_layers.append(LinearBN(hin, hout)) 69 | self.hidden_layers = chainer.ChainList(*hidden_layers) 70 | self.output = L.Linear(hidden_sizes[-1], out_size, 71 | initialW=LeCunNormal(last_wscale)) 72 | else: 73 | self.output = L.Linear(in_size, out_size, 74 | initialW=LeCunNormal(last_wscale)) 75 | 76 | if normalize_output: 77 | self.output_bn = L.BatchNormalization(out_size) 78 | self.output_bn.avg_var[:] = 1 79 | 80 | def __call__(self, x): 81 | h = x 82 | assert (not chainer.config.train) or x.shape[0] > 1 83 | if self.normalize_input: 84 | h = self.input_bn(h) 85 | if self.hidden_sizes: 86 | for l in self.hidden_layers: 87 | h = self.nonlinearity(l(h)) 88 | h = self.output(h) 89 | if self.normalize_output: 90 | h = self.output_bn(h) 91 | return h 92 | -------------------------------------------------------------------------------- /tests/misc_tests/test_random.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import timeit 10 | import unittest 11 | 12 | from chainer import testing 13 | from chainer.testing import condition 14 | import numpy as np 15 | from scipy import stats 16 | 17 | from chainerrl.misc.random import sample_n_k 18 | 19 | 20 | @testing.parameterize( 21 | {'n': 2, 'k': 2}, 22 | {'n': 5, 'k': 1}, 23 | {'n': 5, 'k': 4}, 24 | {'n': 7, 'k': 2}, 25 | {'n': 20, 'k': 10}, 26 | {'n': 100, 'k': 5}, 27 | {'n': 1, 'k': 0}, 28 | {'n': 0, 'k': 0}, 29 | ) 30 | class TestSampleNK(unittest.TestCase): 31 | def test_fast(self): 32 | self.samples = [sample_n_k(self.n, self.k) for _ in range(200)] 33 | self.subtest_constraints() 34 | 35 | def subtest_constraints(self): 36 | for s in self.samples: 37 | self.assertEqual(len(s), self.k) 38 | 39 | all(0 <= x < self.n for x in s) 40 | 41 | # distinct 42 | t = np.unique(s) 43 | self.assertEqual(len(t), self.k) 44 | 45 | @testing.attr.slow 46 | @condition.repeat_with_success_at_least(3, 2) 47 | def test_slow(self): 48 | self.samples = [sample_n_k(self.n, self.k) for _ in range(100000)] 49 | self.subtest_total_counts() 50 | self.subtest_order_counts() 51 | 52 | def subtest_total_counts(self): 53 | if self.k in [0, self.n]: 54 | return 55 | 56 | cnt = np.zeros(self.n) 57 | for s in self.samples: 58 | for x in s: 59 | cnt[x] += 1 60 | 61 | m = len(self.samples) 62 | 63 | p = self.k / self.n 64 | mean = m * p 65 | std = np.sqrt(m * p * (1 - p)) 66 | 67 | self.subtest_normal_distrib(cnt, mean, std) 68 | 69 | def subtest_order_counts(self): 70 | if self.k < 2: 71 | return 72 | 73 | ordered_pairs = [(i, j) for j in 
range(self.k) for i in range(j)] 74 | cnt = np.zeros(len(ordered_pairs)) 75 | 76 | for s in self.samples: 77 | for t, (i, j) in enumerate(ordered_pairs): 78 | if s[i] < s[j]: 79 | cnt[t] += 1 80 | 81 | m = len(self.samples) 82 | 83 | mean = m / 2 84 | std = np.sqrt(m / 4) 85 | 86 | self.subtest_normal_distrib(cnt, mean, std) 87 | 88 | def subtest_normal_distrib(self, xs, mean, std): 89 | _, pvalue = stats.kstest(xs, 'norm', (mean, std)) 90 | self.assertGreater(pvalue, 3e-3) 91 | 92 | 93 | class TestSampleNKSpeed(unittest.TestCase): 94 | def get_timeit(self, setup): 95 | return min(timeit.Timer( 96 | 'for n in range(64, 10000): sample_n_k(n, 64)', 97 | setup=setup).repeat(repeat=10, number=1)) 98 | 99 | @testing.attr.slow 100 | def _test(self): 101 | t = self.get_timeit( 102 | "from chainerrl.misc.random import sample_n_k") 103 | 104 | # faster than random.sample 105 | t1 = self.get_timeit(""" 106 | import random 107 | import six 108 | def sample_n_k(n, k): 109 | return random.sample(six.moves.range(n), k) 110 | """) 111 | self.assertLess(t, t1) 112 | 113 | # faster than np.random.choice(..., replace=False) 114 | t2 = self.get_timeit(""" 115 | import numpy as np 116 | def sample_n_k(n, k): 117 | return np.random.choice(n, k, replace=False) 118 | """) 119 | self.assertLess(t, t2) 120 | -------------------------------------------------------------------------------- /chainerrl/agents/dpp.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | import chainer 13 | import chainer.functions as F 14 | from future.utils import with_metaclass 15 | 16 | from chainerrl.agents.dqn import DQN 17 | 18 | 19 | class AbstractDPP(with_metaclass(ABCMeta, DQN)): 20 | """Dynamic Policy Programming. 21 | 22 | See: https://arxiv.org/abs/1004.2027.
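The subclasses below differ only in the operator L implemented by _l_operator. As the inline comments in _compute_y_and_t spell out, the training target is t = Q'(s_t, a_t) + r_t + gamma * LQ'(s_{t+1}, .) - LQ'(s_t, .), where Q' denotes the target network.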
23 | """ 24 | 25 | @abstractmethod 26 | def _l_operator(self, qout): 27 | raise NotImplementedError() 28 | 29 | def _compute_target_values(self, exp_batch): 30 | 31 | batch_next_state = exp_batch['next_state'] 32 | 33 | target_next_qout = self.target_q_function(batch_next_state) 34 | next_q_expect = self._l_operator(target_next_qout) 35 | 36 | batch_rewards = exp_batch['reward'] 37 | batch_terminal = exp_batch['is_state_terminal'] 38 | 39 | return (batch_rewards + 40 | exp_batch['discount'] * (1 - batch_terminal) * next_q_expect) 41 | 42 | def _compute_y_and_t(self, exp_batch): 43 | 44 | batch_state = exp_batch['state'] 45 | batch_size = len(exp_batch['reward']) 46 | 47 | qout = self.q_function(batch_state) 48 | 49 | batch_actions = exp_batch['action'] 50 | # Q(s_t,a_t) 51 | batch_q = F.reshape(qout.evaluate_actions( 52 | batch_actions), (batch_size, 1)) 53 | 54 | with chainer.no_backprop_mode(): 55 | # Compute target values 56 | target_qout = self.target_q_function(batch_state) 57 | 58 | # Q'(s_t,a_t) 59 | target_q = F.reshape(target_qout.evaluate_actions( 60 | batch_actions), (batch_size, 1)) 61 | 62 | # LQ'(s_t,a) 63 | target_q_expect = F.reshape( 64 | self._l_operator(target_qout), (batch_size, 1)) 65 | 66 | # r + g * LQ'(s_{t+1},a) 67 | batch_q_target = F.reshape( 68 | self._compute_target_values(exp_batch), (batch_size, 1)) 69 | 70 | # Q'(s_t,a_t) + r + g * LQ'(s_{t+1},a) - LQ'(s_t,a) 71 | t = target_q + batch_q_target - target_q_expect 72 | 73 | return batch_q, t 74 | 75 | 76 | class DPP(AbstractDPP): 77 | """Dynamic Policy Programming with softmax operator. 78 | 79 | Args: 80 | eta (float): Positive constant. 81 | 82 | For other arguments, see DQN. 83 | """ 84 | 85 | def __init__(self, *args, **kwargs): 86 | self.eta = kwargs.pop('eta', 1.0) 87 | super().__init__(*args, **kwargs) 88 | 89 | def _l_operator(self, qout): 90 | return qout.compute_expectation(self.eta) 91 | 92 | 93 | class DPPL(AbstractDPP): 94 | """Dynamic Policy Programming with L operator. 95 | 96 | Args: 97 | eta (float): Positive constant. 98 | 99 | For other arguments, see DQN. 100 | """ 101 | 102 | def __init__(self, *args, **kwargs): 103 | self.eta = kwargs.pop('eta', 1.0) 104 | super().__init__(*args, **kwargs) 105 | 106 | def _l_operator(self, qout): 107 | return F.logsumexp(self.eta * qout.q_values, axis=1) / self.eta 108 | 109 | 110 | class DPPGreedy(AbstractDPP): 111 | """Dynamic Policy Programming with max operator. 112 | 113 | This algorithm corresponds to DPP with eta = infinity. 114 | """ 115 | 116 | def _l_operator(self, qout): 117 | return qout.max 118 | --------------------------------------------------------------------------------
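A minimal sketch of the three L-operators defined above, in plain NumPy/SciPy rather than Chainer. The softmax-expectation reading of `compute_expectation` is an assumption based on the surrounding code and the DPP paper; the log-sum-exp and max forms follow the code directly:

```python
# Illustrative only -- not part of the repository.
import numpy as np
from scipy.special import logsumexp, softmax

q = np.array([[1.0, 2.0, 3.0],
              [0.0, 0.0, 1.0]])  # Q-values, shape (batch, n_actions)
eta = 1.0

# DPP: softmax-weighted expectation of Q
# (assumed semantics of qout.compute_expectation(eta)).
l_dpp = np.sum(softmax(eta * q, axis=1) * q, axis=1)

# DPPL: log-sum-exp operator, as in F.logsumexp(eta * q, axis=1) / eta.
l_dppl = logsumexp(eta * q, axis=1) / eta

# DPPGreedy: max operator, the eta -> infinity limit of both.
l_greedy = q.max(axis=1)

print(l_dpp, l_dppl, l_greedy)
```

As eta grows, both soft operators approach the max, which is why DPPGreedy is described as DPP with eta = infinity.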