├── MANIFEST.in
├── assets
│   ├── ChainerRL.png
│   ├── breakout.gif
│   ├── grasping.gif
│   └── humanoid.gif
├── chainerrl
│   ├── v_functions
│   │   ├── __init__.py
│   │   └── v_functions.py
│   ├── initializers
│   │   ├── __init__.py
│   │   ├── normal.py
│   │   └── constant.py
│   ├── envs
│   │   ├── __init__.py
│   │   └── serial_vector_env.py
│   ├── optimizers
│   │   ├── __init__.py
│   │   ├── nonbias_weight_decay.py
│   │   └── rmsprop_async.py
│   ├── q_functions
│   │   ├── __init__.py
│   │   └── dueling_dqn.py
│   ├── policies
│   │   ├── __init__.py
│   │   ├── mellowmax_policy.py
│   │   └── softmax_policy.py
│   ├── functions
│   │   ├── __init__.py
│   │   ├── scale_grad.py
│   │   ├── bound_by_tanh.py
│   │   ├── sum_arrays.py
│   │   ├── weighted_sum_arrays.py
│   │   ├── invert_gradients.py
│   │   └── mellowmax.py
│   ├── misc
│   │   ├── ask_yes_no.py
│   │   ├── __init__.py
│   │   ├── makedirs.py
│   │   ├── batch_states.py
│   │   ├── is_return_code_zero.py
│   │   ├── init_like_torch.py
│   │   ├── random.py
│   │   ├── reward_filter.py
│   │   ├── conjugate_gradient.py
│   │   ├── random_seed.py
│   │   ├── draw_computational_graph.py
│   │   ├── env_modifiers.py
│   │   └── copy_param.py
│   ├── explorers
│   │   ├── __init__.py
│   │   ├── greedy.py
│   │   ├── additive_gaussian.py
│   │   ├── boltzmann.py
│   │   ├── additive_ou.py
│   │   └── epsilon_greedy.py
│   ├── wrappers
│   │   ├── __init__.py
│   │   ├── scale_reward.py
│   │   ├── render.py
│   │   ├── cast_observation.py
│   │   ├── continuing_time_limit.py
│   │   └── randomize_action.py
│   ├── links
│   │   ├── __init__.py
│   │   ├── mlp.py
│   │   ├── noisy_chain.py
│   │   ├── sequence.py
│   │   ├── dqn_head.py
│   │   ├── noisy_linear.py
│   │   └── mlp_bn.py
│   ├── v_function.py
│   ├── q_function.py
│   ├── policy.py
│   ├── experiments
│   │   ├── __init__.py
│   │   └── hooks.py
│   ├── explorer.py
│   ├── agents
│   │   ├── __init__.py
│   │   ├── double_dqn.py
│   │   ├── sarsa.py
│   │   ├── residual_dqn.py
│   │   ├── double_pal.py
│   │   ├── al.py
│   │   ├── pal.py
│   │   └── dpp.py
│   ├── env.py
│   └── __init__.py
├── readthedocs.yml
├── .gitignore
├── requirements-dev.txt
├── docs
│   ├── reference.rst
│   ├── recurrent.rst
│   ├── experiments.rst
│   ├── action_values.rst
│   ├── install.rst
│   ├── distributions.rst
│   ├── Makefile
│   ├── agents.rst
│   ├── make.bat
│   └── index.rst
├── requirements.txt
├── examples
│   ├── README.md
│   ├── ale
│   │   ├── README.md
│   │   └── dqn_phi.py
│   ├── gym
│   │   └── README.md
│   ├── grasping
│   │   └── README.md
│   └── atari
│       └── dqn
│           └── README.md
├── tests
│   ├── misc_tests
│   │   ├── test_is_return_code_zero.py
│   │   ├── test_conjugate_gradient.py
│   │   ├── test_batch_states.py
│   │   ├── test_random_seed.py
│   │   ├── test_copy_param.py
│   │   ├── test_collections.py
│   │   ├── test_draw_computational_graph.py
│   │   └── test_random.py
│   ├── explorers_tests
│   │   ├── test_additive_gaussian.py
│   │   ├── test_additive_ou.py
│   │   ├── test_boltzmann.py
│   │   └── test_epsilon_greedy.py
│   ├── experiments_tests
│   │   ├── test_hooks.py
│   │   └── test_train_agent.py
│   ├── q_functions_tests
│   │   └── basetest_state_action_q_function.py
│   ├── agents_tests
│   │   ├── test_ddpg.py
│   │   ├── test_sarsa.py
│   │   ├── test_pgt.py
│   │   ├── test_al.py
│   │   ├── test_pal.py
│   │   ├── test_double_pal.py
│   │   ├── test_double_dqn.py
│   │   ├── basetest_agents.py
│   │   ├── test_residual_dqn.py
│   │   └── test_dpp.py
│   ├── links_tests
│   │   ├── test_noisy_chain.py
│   │   ├── test_mlp_bn.py
│   │   ├── test_sequence.py
│   │   ├── test_noisy_linear.py
│   │   └── test_empirical_normalization.py
│   ├── wrappers_tests
│   │   ├── test_scale_reward.py
│   │   ├── test_continuing_time_limit.py
│   │   ├── test_cast_observation.py
│   │   ├── test_render.py
│   │   └── test_randomize_action.py
│   ├── optimizer_tests
│   │   └── test_nonbias_weight_decay.py
│   ├── functions_tests
│   │   ├── test_sum_arrays.py
│   │   ├── test_weighted_sum_arrays.py
│   │   ├── test_lower_triangular_matrix.py
│   │   └── test_invert_gradients.py
│   ├── test_ale.py
│   ├── envs_tests
│   │   └── test_vector_envs.py
│   └── test_agent.py
├── setup.py
├── LICENSE
├── CONTRIBUTING.md
├── .travis.yml
└── tools
    └── plot_scores.py
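The tree above mirrors ChainerRL's public import layout: each subpackage is re-exported through the `__init__.py` files reproduced below. A minimal sketch of how these paths are reached from user code (the names are taken from those `__init__.py` files; no real training setup is shown):

```
import chainerrl

# Subdirectories of chainerrl/ map one-to-one onto public modules.
explorer = chainerrl.explorers.Greedy()                        # explorers/greedy.py
wrap_obs = chainerrl.wrappers.CastObservationToFloat32         # wrappers/cast_observation.py
agent_cls = chainerrl.agents.DoubleDQN                         # agents/double_dqn.py
train_fn = chainerrl.experiments.train_agent_with_evaluation  # experiments/train_agent.py

print(explorer)  # -> Greedy()
```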
/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | -------------------------------------------------------------------------------- /assets/ChainerRL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imos/chainerrl/master/assets/ChainerRL.png -------------------------------------------------------------------------------- /assets/breakout.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imos/chainerrl/master/assets/breakout.gif -------------------------------------------------------------------------------- /assets/grasping.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imos/chainerrl/master/assets/grasping.gif -------------------------------------------------------------------------------- /assets/humanoid.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imos/chainerrl/master/assets/humanoid.gif -------------------------------------------------------------------------------- /chainerrl/v_functions/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.v_functions.v_functions import * # NOQA 2 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | name: chainerrl 2 | type: sphinx 3 | base: docs 4 | python: 5 | setup_py_install: true 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .ipynb_checkpoints 3 | chainerrl.egg-info 4 | build/ 5 | dist/ 6 | .idea/ 7 | results/ 8 | examples/gym/results/ 9 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | autopep8 3 | atari_py 4 | flake8 5 | mock 6 | opencv-python 7 | pytest 8 | sphinx 9 | sphinx_rtd_theme 10 | -------------------------------------------------------------------------------- /chainerrl/initializers/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.initializers.constant import VarianceScalingConstant # NOQA 2 | from chainerrl.initializers.normal import LeCunNormal # NOQA 3 | -------------------------------------------------------------------------------- /chainerrl/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.envs.multiprocess_vector_env import MultiprocessVectorEnv # NOQA 2 | from chainerrl.envs.serial_vector_env import SerialVectorEnv # NOQA 3 | -------------------------------------------------------------------------------- /chainerrl/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.optimizers.nonbias_weight_decay import NonbiasWeightDecay # noqa 2 | from chainerrl.optimizers.rmsprop_async import RMSpropAsync # noqa 3 | -------------------------------------------------------------------------------- /docs/reference.rst: 
-------------------------------------------------------------------------------- 1 | ============= 2 | API Reference 3 | ============= 4 | 5 | .. toctree:: 6 | :maxdepth: 1 7 | 8 | action_values 9 | agents 10 | distributions 11 | experiments 12 | recurrent 13 | -------------------------------------------------------------------------------- /chainerrl/q_functions/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.q_functions.dueling_dqn import * # NOQA 2 | from chainerrl.q_functions.state_action_q_functions import * # NOQA 3 | from chainerrl.q_functions.state_q_functions import * # NOQA 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cached-property 2 | chainer>=3.1.0 3 | fastcache; python_version<'3.2' 4 | funcsigs; python_version<'3.5' 5 | future 6 | gym>=0.9.7 7 | numpy>=1.10.4 8 | pillow 9 | scipy 10 | statistics; python_version<'3.4' 11 | -------------------------------------------------------------------------------- /chainerrl/policies/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.policies.deterministic_policy import * # NOQA 2 | from chainerrl.policies.gaussian_policy import * # NOQA 3 | from chainerrl.policies.mellowmax_policy import * # NOQA 4 | from chainerrl.policies.softmax_policy import * # NOQA 5 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | - `ale`: examples for Atari 2600 games in the Arcade Learning Environment 4 | - `gym`: examples for OpenAI Gym environments 5 | - `grasping`: examples for a Bullet-based robotic grasping environment 6 | - `quickstart`: a quickstart guide of ChainerRL 7 | -------------------------------------------------------------------------------- /chainerrl/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.functions.sum_arrays import sum_arrays # NOQA 2 | from chainerrl.functions.sum_arrays import SumArrays # NOQA 3 | from chainerrl.functions.weighted_sum_arrays import weighted_sum_arrays # NOQA 4 | from chainerrl.functions.weighted_sum_arrays import WeightedSumArrays # NOQA 5 | -------------------------------------------------------------------------------- /chainerrl/misc/ask_yes_no.py: -------------------------------------------------------------------------------- 1 | from builtins import * # NOQA 2 | 3 | 4 | def ask_yes_no(question): 5 | while True: 6 | choice = input("{} [y/N]: ".format(question)).lower() 7 | if choice in ['y', 'ye', 'yes']: 8 | return True 9 | elif choice in ['n', 'no']: 10 | return False 11 | -------------------------------------------------------------------------------- /docs/recurrent.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | Using recurrent models 3 | ====================== 4 | 5 | Recurrent model interface 6 | ========================= 7 | 8 | .. autoclass:: chainerrl.recurrent.Recurrent 9 | :members: 10 | 11 | Utilities 12 | ========= 13 | 14 | .. autofunction:: chainerrl.recurrent.state_kept 15 | 16 | .. 
autofunction:: chainerrl.recurrent.state_reset 17 | -------------------------------------------------------------------------------- /chainerrl/explorers/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.explorers.additive_gaussian import AdditiveGaussian # NOQA 2 | from chainerrl.explorers.additive_ou import AdditiveOU # NOQA 3 | from chainerrl.explorers.boltzmann import Boltzmann # NOQA 4 | from chainerrl.explorers.epsilon_greedy import ConstantEpsilonGreedy # NOQA 5 | from chainerrl.explorers.epsilon_greedy import LinearDecayEpsilonGreedy # NOQA 6 | from chainerrl.explorers.greedy import Greedy # NOQA 7 | -------------------------------------------------------------------------------- /docs/experiments.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Experiments 3 | =========== 4 | 5 | Training and evaluation 6 | ======================= 7 | 8 | .. autofunction:: chainerrl.experiments.train_agent_async 9 | 10 | .. autofunction:: chainerrl.experiments.train_agent_with_evaluation 11 | 12 | Training hooks 13 | ============== 14 | 15 | .. autoclass:: chainerrl.experiments.StepHook 16 | :members: 17 | 18 | .. autoclass:: chainerrl.experiments.LinearInterpolationHook 19 | -------------------------------------------------------------------------------- /chainerrl/initializers/normal.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import numpy as np 3 | 4 | 5 | class LeCunNormal(chainer.initializers.HeNormal): 6 | """LeCunNormal is (essentially) the default initializer in Chainer v1. 7 | 8 | chainer.initializers.LeCunNormal is not available yet. 9 | (Chainer Pull Request #2764 has not been merged.) 10 | """ 11 | 12 | def __init__(self, scale=1.0, dtype=None): 13 | super(LeCunNormal, self).__init__(np.sqrt(0.5) * scale, dtype) 14 | -------------------------------------------------------------------------------- /docs/action_values.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Action values 3 | ============= 4 | 5 | Action value interfaces 6 | ======================= 7 | 8 | .. autoclass:: chainerrl.action_value.ActionValue 9 | :members: 10 | 11 | Action value implementations 12 | ============================ 13 | 14 | .. autoclass:: chainerrl.action_value.DiscreteActionValue 15 | 16 | .. autoclass:: chainerrl.action_value.QuadraticActionValue 17 | 18 | .. 
autoclass:: chainerrl.action_value.SingleActionValue 19 | -------------------------------------------------------------------------------- /chainerrl/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.wrappers.cast_observation import CastObservation # NOQA 2 | from chainerrl.wrappers.cast_observation import CastObservationToFloat32 # NOQA 3 | 4 | from chainerrl.wrappers.continuing_time_limit import ContinuingTimeLimit # NOQA 5 | 6 | from chainerrl.wrappers.randomize_action import RandomizeAction # NOQA 7 | 8 | from chainerrl.wrappers.render import Render # NOQA 9 | 10 | from chainerrl.wrappers.scale_reward import ScaleReward # NOQA 11 | -------------------------------------------------------------------------------- /chainerrl/links/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.links.dqn_head import NatureDQNHead # NOQA 2 | from chainerrl.links.dqn_head import NIPSDQNHead # NOQA 3 | from chainerrl.links.empirical_normalization import EmpiricalNormalization # NOQA 4 | from chainerrl.links.mlp import MLP # NOQA 5 | from chainerrl.links.mlp_bn import MLPBN # NOQA 6 | from chainerrl.links.noisy_chain import to_factorized_noisy # NOQA 7 | from chainerrl.links.noisy_linear import FactorizedNoisyLinear # NOQA 8 | from chainerrl.links.sequence import Sequence # NOQA 9 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | How to install ChainerRL 6 | ======================== 7 | 8 | ChainerRL is tested with Python 2.7+ and 3.5.1+. For other requirements, see ``requirements.txt``. 9 | 10 | .. literalinclude:: ../requirements.txt 11 | :caption: requirements.txt 12 | 13 | ChainerRL can be installed via PyPI, 14 | 15 | :: 16 | 17 | pip install chainerrl 18 | 19 | or through the source code: 20 | 21 | :: 22 | 23 | git clone https://github.com/chainer/chainerrl.git 24 | cd chainerrl 25 | python setup.py install 26 | -------------------------------------------------------------------------------- /docs/distributions.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Distributions 3 | ============= 4 | 5 | Distribution interfaces 6 | ======================= 7 | 8 | .. autoclass:: chainerrl.distribution.Distribution 9 | :members: 10 | 11 | 12 | Distribution implementations 13 | ============================ 14 | 15 | .. autoclass:: chainerrl.distribution.GaussianDistribution 16 | 17 | .. autoclass:: chainerrl.distribution.SoftmaxDistribution 18 | 19 | .. autoclass:: chainerrl.distribution.MellowmaxDistribution 20 | 21 | .. 
autoclass:: chainerrl.distribution.ContinuousDeterministicDistribution 22 | -------------------------------------------------------------------------------- /chainerrl/v_function.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | from future.utils import with_metaclass 13 | 14 | 15 | class VFunction(with_metaclass(ABCMeta, object)): 16 | 17 | @abstractmethod 18 | def __call__(self, x): 19 | raise NotImplementedError() 20 | -------------------------------------------------------------------------------- /chainerrl/misc/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.misc.batch_states import batch_states # NOQA 2 | from chainerrl.misc.conjugate_gradient import conjugate_gradient # NOQA 3 | from chainerrl.misc.draw_computational_graph import collect_variables # NOQA 4 | from chainerrl.misc.draw_computational_graph import draw_computational_graph # NOQA 5 | from chainerrl.misc.draw_computational_graph import is_graphviz_available # NOQA 6 | from chainerrl.misc import env_modifiers # NOQA 7 | from chainerrl.misc.is_return_code_zero import is_return_code_zero # NOQA 8 | from chainerrl.misc.random_seed import set_random_seed # NOQA 9 | -------------------------------------------------------------------------------- /chainerrl/explorers/greedy.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainerrl import explorer 10 | 11 | 12 | class Greedy(explorer.Explorer): 13 | """No exploration""" 14 | 15 | def select_action(self, t, greedy_action_func, action_value=None): 16 | return greedy_action_func() 17 | 18 | def __repr__(self): 19 | return 'Greedy()' 20 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = ChainerRL 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
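# For example, "make html" falls through to the catch-all rule below and
# expands to: sphinx-build -M html "." "_build"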
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /chainerrl/misc/makedirs.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import os 10 | import six 11 | 12 | 13 | def makedirs(name, mode=0o777, exist_ok=False): 14 | """A wrapper of os.makedirs that accepts exist_ok.""" 15 | if six.PY2: 16 | try: 17 | os.makedirs(name, mode) 18 | except OSError: 19 | if not os.path.isdir(name): 20 | raise 21 | else: 22 | os.makedirs(name, mode, exist_ok=exist_ok) 23 | -------------------------------------------------------------------------------- /chainerrl/initializers/constant.py: -------------------------------------------------------------------------------- 1 | from chainer import initializer 2 | from chainer.initializers import Constant 3 | import numpy 4 | 5 | 6 | class VarianceScalingConstant(initializer.Initializer): 7 | def __init__(self, scale=1.0, dtype=None): 8 | super(VarianceScalingConstant, self).__init__(dtype) 9 | self.scale = scale 10 | 11 | def __call__(self, array): 12 | if self.dtype is not None: 13 | assert array.dtype == self.dtype 14 | 15 | if len(array.shape) == 1: 16 | Constant(self.scale / numpy.sqrt(array.shape[0]))(array) 17 | else: 18 | fan_in, _ = initializer.get_fans(array.shape) 19 | 20 | Constant(self.scale / numpy.sqrt(fan_in))(array) 21 | -------------------------------------------------------------------------------- /chainerrl/misc/batch_states.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | 3 | 4 | def batch_states(states, xp, phi): 5 | """The default method for making a batch of observations. 6 | 7 | Args: 8 | states (list): list of observations from an environment. 9 | xp (module): numpy or cupy 10 | phi (callable): Feature extractor applied to observations 11 | 12 | Returns: 13 | the object which will be given as input to the model.
14 | """ 15 | if chainer.cuda.available and xp is chainer.cuda.cupy: 16 | # GPU 17 | device = chainer.cuda.Device().id 18 | else: 19 | # CPU 20 | device = -1 21 | 22 | features = [phi(s) for s in states] 23 | return chainer.dataset.concat_examples(features, device=device) 24 | -------------------------------------------------------------------------------- /tests/misc_tests/test_is_return_code_zero.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import unittest 9 | 10 | import chainerrl 11 | 12 | 13 | class TestIsReturnCodeZero(unittest.TestCase): 14 | 15 | def test(self): 16 | # Assume ls command exists 17 | self.assertTrue(chainerrl.misc.is_return_code_zero(['ls'])) 18 | self.assertFalse(chainerrl.misc.is_return_code_zero( 19 | ['ls --nonexistentoption'])) 20 | self.assertFalse(chainerrl.misc.is_return_code_zero( 21 | ['nonexistentcommand'])) 22 | -------------------------------------------------------------------------------- /examples/ale/README.md: -------------------------------------------------------------------------------- 1 | # Examples for Arcade Learning Environment 2 | 3 | - `train_a3c_ale.py`: A3C 4 | - `train_acer_ale.py`: ACER 5 | - `train_categorical_dqn_ale.py`: CategoricalDQN 6 | - `train_dqn_ale.py`: DQN, DoubleDQN or PAL 7 | - `train_nsq_ale.py`: NSQ (n-step Q-learning) 8 | - `train_ppo_ale.py`: PPO 9 | 10 | ## Requirements 11 | 12 | - atari_py>=0.1.1 13 | - opencv-python 14 | 15 | ## How to run 16 | 17 | ``` 18 | python train_a3c_ale.py n_processes [options] 19 | python train_acer_ale.py n_processes [options] 20 | python train_categorical_dqn_ale.py [options] 21 | python train_dqn_ale.py [options] 22 | python train_nsq_ale.py n_processes [options] 23 | python train_ppo_ale.py [options] 24 | ``` 25 | 26 | Specify `--help` or read code for options. 
27 | -------------------------------------------------------------------------------- /chainerrl/q_function.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | from future.utils import with_metaclass 13 | 14 | 15 | class StateQFunction(with_metaclass(ABCMeta, object)): 16 | 17 | @abstractmethod 18 | def __call__(self, x): 19 | raise NotImplementedError() 20 | 21 | 22 | class StateActionQFunction(with_metaclass(ABCMeta, object)): 23 | 24 | @abstractmethod 25 | def __call__(self, x, a): 26 | raise NotImplementedError() 27 | -------------------------------------------------------------------------------- /chainerrl/policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | from future.utils import with_metaclass 13 | 14 | from logging import getLogger 15 | logger = getLogger(__name__) 16 | 17 | 18 | class Policy(with_metaclass(ABCMeta, object)): 19 | """Abstract policy.""" 20 | 21 | @abstractmethod 22 | def __call__(self, state): 23 | """Evaluate a policy. 24 | 25 | Returns: 26 | Distribution of actions 27 | """ 28 | raise NotImplementedError() 29 | -------------------------------------------------------------------------------- /chainerrl/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.experiments.evaluator import eval_performance # NOQA 2 | 3 | from chainerrl.experiments.hooks import LinearInterpolationHook # NOQA 4 | from chainerrl.experiments.hooks import StepHook # NOQA 5 | 6 | from chainerrl.experiments.prepare_output_dir import is_under_git_control # NOQA 7 | from chainerrl.experiments.prepare_output_dir import prepare_output_dir # NOQA 8 | 9 | from chainerrl.experiments.train_agent import train_agent # NOQA 10 | from chainerrl.experiments.train_agent import train_agent_with_evaluation # NOQA 11 | from chainerrl.experiments.train_agent_async import train_agent_async # NOQA 12 | from chainerrl.experiments.train_agent_batch import train_agent_batch # NOQA 13 | from chainerrl.experiments.train_agent_batch import train_agent_batch_with_evaluation # NOQA 14 | -------------------------------------------------------------------------------- /examples/ale/dqn_phi.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import numpy as np 10 | 11 | 12 | def dqn_phi(screens): 13 | """Phi (feature extractor) of DQN for ALE 14 | 15 | Args: 16 | screens: List of N screen objects. 
Each screen object must be 17 | a numpy.ndarray whose dtype is numpy.uint8. 18 | Returns: 19 | numpy.ndarray 20 | """ 21 | assert len(screens) == 4 22 | assert screens[0].dtype == np.uint8 23 | raw_values = np.asarray(screens, dtype=np.float32) 24 | # [0,255] -> [0, 1] 25 | raw_values /= 255.0 26 | return raw_values 27 | -------------------------------------------------------------------------------- /tests/explorers_tests/test_additive_gaussian.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import numpy as np 10 | 11 | from chainerrl.explorers.additive_gaussian import AdditiveGaussian 12 | 13 | 14 | class TestAdditiveGaussian(unittest.TestCase): 15 | 16 | def test(self): 17 | 18 | action_size = 3 19 | scale = 0.1 20 | 21 | def greedy_action_func(): 22 | return np.asarray([0] * action_size, dtype=np.float32) 23 | 24 | explorer = AdditiveGaussian(scale) 25 | 26 | for t in range(1000): 27 | a = explorer.select_action(t, greedy_action_func) 28 | print(a) 29 | -------------------------------------------------------------------------------- /chainerrl/explorer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | from future.utils import with_metaclass 12 | 13 | 14 | class Explorer(with_metaclass(ABCMeta, object)): 15 | """Abstract explorer.""" 16 | 17 | @abstractmethod 18 | def select_action(self, t, greedy_action_func, action_value=None): 19 | """Select an action.
20 | 21 | Args: 22 | t: current time step 23 | greedy_action_func: function with no argument that returns an action 24 | action_value (ActionValue): ActionValue object 25 | """ 26 | raise NotImplementedError() 27 | -------------------------------------------------------------------------------- /chainerrl/functions/scale_grad.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import numpy 8 | 9 | import chainer 10 | from chainer.utils import type_check 11 | 12 | 13 | class ScaleGrad(chainer.Function): 14 | 15 | def __init__(self, scale): 16 | self.scale = scale 17 | 18 | def check_type_forward(self, in_types): 19 | type_check.expect( 20 | in_types.size() == 1, 21 | in_types[0].dtype == numpy.float32 22 | ) 23 | 24 | def forward(self, x): 25 | return x 26 | 27 | def backward(self, x, gy): 28 | return tuple(g * self.scale for g in gy) 29 | 30 | 31 | def scale_grad(x, scale): 32 | return ScaleGrad(scale=scale)(x) 33 | -------------------------------------------------------------------------------- /chainerrl/misc/is_return_code_zero.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import os 9 | import subprocess 10 | 11 | 12 | def is_return_code_zero(args): 13 | """Return true iff the given command's return code is zero. 14 | 15 | All the messages to stdout or stderr are suppressed. 16 | """ 17 | with open(os.devnull, 'wb') as FNULL: 18 | try: 19 | subprocess.check_call(args, stdout=FNULL, stderr=FNULL) 20 | except subprocess.CalledProcessError: 21 | # The given command returned an error 22 | return False 23 | except OSError: 24 | # The given command was not found 25 | return False 26 | return True 27 | -------------------------------------------------------------------------------- /chainerrl/wrappers/scale_reward.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import gym 10 | 11 | 12 | class ScaleReward(gym.RewardWrapper): 13 | """Scale reward by a scale factor. 14 | 15 | Args: 16 | env: Env to wrap. 17 | scale (float): Scale factor. 18 | 19 | Attributes: 20 | scale: Scale factor. 21 | original_reward: Reward before scaling.
22 | """ 23 | 24 | def __init__(self, env, scale): 25 | super().__init__(env) 26 | self.scale = scale 27 | self.original_reward = None 28 | 29 | def _reward(self, reward): 30 | self.original_reward = reward 31 | return self.scale * reward 32 | -------------------------------------------------------------------------------- /examples/gym/README.md: -------------------------------------------------------------------------------- 1 | # Examples for OpenAI Gym environments 2 | 3 | - `train_a3c_gym.py`: A3C for both discrete action and continuous action spaces 4 | - `train_acer_gym.py`: DiscreteACER for discrete action spaces 5 | - `train_dqn_gym.py`: DQN for both discrete action and continuous action spaces 6 | - `train_ddpg_gym.py`: DDPG for continuous action spaces 7 | - `train_pcl_gym.py`: PCL for both discrete action and continuous action spaces 8 | - `train_reinforce_gym.py`: REINFORCE for both discrete action and continuous action spaces (only for episodic envs) 9 | 10 | ## How to run 11 | 12 | ``` 13 | python train_a3c_gym.py n_processes [options] 14 | python train_acer_gym.py n_processes [options] 15 | python train_dqn_gym.py [options] 16 | python train_ddpg_gym.py [options] 17 | python train_pcl_gym.py [options] 18 | python train_reinforce_gym.py [options] 19 | ``` 20 | 21 | Specify `--help` or read code for options. 22 | -------------------------------------------------------------------------------- /docs/agents.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | Agents 3 | ====== 4 | 5 | Agent interfaces 6 | ================ 7 | 8 | .. autoclass:: chainerrl.agent.Agent 9 | :members: 10 | 11 | Agent implementations 12 | ===================== 13 | 14 | .. autoclass:: chainerrl.agents.A3C 15 | 16 | .. autoclass:: chainerrl.agents.ACER 17 | 18 | .. autoclass:: chainerrl.agents.AL 19 | 20 | .. autoclass:: chainerrl.agents.DDPG 21 | 22 | .. autoclass:: chainerrl.agents.DoubleDQN 23 | 24 | .. autoclass:: chainerrl.agents.DoublePAL 25 | 26 | .. autoclass:: chainerrl.agents.DPP 27 | 28 | .. autoclass:: chainerrl.agents.DQN 29 | 30 | .. autoclass:: chainerrl.agents.NSQ 31 | 32 | .. autoclass:: chainerrl.agents.PAL 33 | 34 | .. autoclass:: chainerrl.agents.PCL 35 | 36 | .. autoclass:: chainerrl.agents.PGT 37 | 38 | .. autoclass:: chainerrl.agents.REINFORCE 39 | 40 | .. autoclass:: chainerrl.agents.ResidualDQN 41 | 42 | .. autoclass:: chainerrl.agents.SARSA 43 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=ChainerRL 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. ChainerRL documentation master file, created by 2 | sphinx-quickstart on Tue Mar 14 22:25:44 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ================================================ 7 | ChainerRL, a deep reinforcement learning library 8 | ================================================ 9 | 10 | ChainerRL is a deep reinforcement learning library that implements various state-of-the-art deep reinforcement learning algorithms in Python using `Chainer <https://chainer.org/>`_, a flexible deep learning framework. 11 | 12 | .. toctree:: 13 | :maxdepth: 2 14 | 15 | install 16 | Quickstart Guide 17 | reference 18 | 19 | 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | -------------------------------------------------------------------------------- /chainerrl/wrappers/render.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import gym 10 | 11 | 12 | class Render(gym.Wrapper): 13 | """Render env by calling its render method. 14 | 15 | Args: 16 | env (gym.Env): Env to wrap. 17 | **kwargs: Keyword arguments passed to the render method. 18 | """ 19 | 20 | def __init__(self, env, **kwargs): 21 | super().__init__(env) 22 | self._kwargs = kwargs 23 | 24 | def reset(self, **kwargs): 25 | ret = self.env.reset(**kwargs) 26 | self.env.render(**self._kwargs) 27 | return ret 28 | 29 | def step(self, action): 30 | ret = self.env.step(action) 31 | self.env.render(**self._kwargs) 32 | return ret 33 | -------------------------------------------------------------------------------- /examples/grasping/README.md: -------------------------------------------------------------------------------- 1 | # Bullet-based robotic grasping 2 | 3 | This directory contains example scripts that learn to grasp objects in an environment simulated by Bullet, a physics simulator. 4 | 5 | ![Grasping](../../assets/grasping.gif) 6 | 7 | ## Files 8 | 9 | - `train_dqn_batch_grasping.py`: DoubleDQN + prioritized experience replay 10 | 11 | ## Requirements 12 | 13 | - pybullet>=2.1.2 14 | 15 | ## How to run 16 | 17 | Train with one simulator, which is slow. 18 | ``` 19 | python examples/grasping/train_dqn_batch_grasping.py 20 | ``` 21 | 22 | Train with 96 simulators run in parallel, which is faster. 23 | ``` 24 | python examples/grasping/train_dqn_batch_grasping.py --num-envs 96 25 | ``` 26 | 27 | Watch how the learned agent performs. `<path to agent>` must be a path to a directory where the agent was saved (e.g. `2000000_finish` created inside the output directory specified as `--outdir`).
28 | ``` 29 | python examples/grasping/train_dqn_batch_grasping.py --demo --render --load <path to agent> 30 | ``` 31 | -------------------------------------------------------------------------------- /tests/experiments_tests/test_hooks.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | import unittest 9 | 10 | import numpy as np 11 | 12 | import chainerrl 13 | 14 | 15 | class TestLinearInterpolationHook(unittest.TestCase): 16 | 17 | def test_call(self): 18 | 19 | buf = [] 20 | 21 | def setter(env, agent, value): 22 | buf.append(value) 23 | 24 | hook = chainerrl.experiments.LinearInterpolationHook( 25 | total_steps=10, 26 | start_value=0.1, 27 | stop_value=1.0, 28 | setter=setter) 29 | 30 | for step in range(1, 10 + 1): 31 | hook(env=None, agent=None, step=step) 32 | 33 | np.testing.assert_allclose( 34 | buf, np.arange(1, 10 + 1, dtype=np.float32) / 10) 35 | -------------------------------------------------------------------------------- /chainerrl/explorers/additive_gaussian.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import numpy as np 10 | 11 | from chainerrl import explorer 12 | 13 | 14 | class AdditiveGaussian(explorer.Explorer): 15 | """Additive Gaussian noise to actions. 16 | 17 | Each action must be a numpy.ndarray. 18 | 19 | Args: 20 | scale (float or array_like of floats): Scale parameter.
21 | """ 22 | 23 | def __init__(self, scale): 24 | self.scale = scale 25 | 26 | def select_action(self, t, greedy_action_func, action_value=None): 27 | a = greedy_action_func() 28 | noise = np.random.normal( 29 | scale=self.scale, size=a.shape).astype(np.float32) 30 | return a + noise 31 | 32 | def __repr__(self): 33 | return 'AdditiveGaussian(scale={})'.format(self.scale) 34 | -------------------------------------------------------------------------------- /chainerrl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl.agents.a2c import A2C # NOQA 2 | from chainerrl.agents.a3c import A3C # NOQA 3 | from chainerrl.agents.acer import ACER # NOQA 4 | from chainerrl.agents.al import AL # NOQA 5 | from chainerrl.agents.categorical_dqn import CategoricalDQN # NOQA 6 | from chainerrl.agents.ddpg import DDPG # NOQA 7 | from chainerrl.agents.double_dqn import DoubleDQN # NOQA 8 | from chainerrl.agents.double_pal import DoublePAL # NOQA 9 | from chainerrl.agents.dpp import DPP # NOQA 10 | from chainerrl.agents.dqn import DQN # NOQA 11 | from chainerrl.agents.iqn import IQN # NOQA 12 | from chainerrl.agents.nsq import NSQ # NOQA 13 | from chainerrl.agents.pal import PAL # NOQA 14 | from chainerrl.agents.pcl import PCL # NOQA 15 | from chainerrl.agents.pgt import PGT # NOQA 16 | from chainerrl.agents.ppo import PPO # NOQA 17 | from chainerrl.agents.reinforce import REINFORCE # NOQA 18 | from chainerrl.agents.residual_dqn import ResidualDQN # NOQA 19 | from chainerrl.agents.sarsa import SARSA # NOQA 20 | from chainerrl.agents.trpo import TRPO # NOQA 21 | -------------------------------------------------------------------------------- /chainerrl/functions/bound_by_tanh.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import cuda 11 | from chainer import functions as F 12 | 13 | 14 | def bound_by_tanh(x, low, high): 15 | """Bound a given value into [low, high] by tanh. 
16 | 17 | Args: 18 | x (chainer.Variable): value to bound 19 | low (numpy.ndarray): lower bound 20 | high (numpy.ndarray): upper bound 21 | Returns: chainer.Variable 22 | """ 23 | assert isinstance(x, chainer.Variable) 24 | assert low is not None 25 | assert high is not None 26 | xp = cuda.get_array_module(x.array) 27 | x_scale = (high - low) / 2 28 | x_scale = xp.expand_dims(xp.asarray(x_scale), axis=0) 29 | x_mean = (high + low) / 2 30 | x_mean = xp.expand_dims(xp.asarray(x_mean), axis=0) 31 | return F.tanh(x) * x_scale + x_mean 32 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | from setuptools import find_packages 3 | from setuptools import setup 4 | import sys 5 | 6 | install_requires = [ 7 | 'cached-property', 8 | 'chainer>=2.0.0', 9 | 'future', 10 | 'gym>=0.9.7', 11 | 'numpy>=1.10.4', 12 | 'pillow', 13 | 'scipy', 14 | ] 15 | 16 | test_requires = [ 17 | 'pytest', 18 | ] 19 | 20 | if sys.version_info < (3, 2): 21 | install_requires.append('fastcache') 22 | 23 | if sys.version_info < (3, 4): 24 | install_requires.append('statistics') 25 | 26 | if sys.version_info < (3, 5): 27 | install_requires.append('funcsigs') 28 | 29 | setup(name='chainerrl', 30 | version='0.5.0', 31 | description='ChainerRL, a deep reinforcement learning library', 32 | long_description=codecs.open('README.md', 'r', encoding='utf-8').read(), 33 | long_description_content_type='text/markdown', 34 | author='Yasuhiro Fujita', 35 | author_email='fujita@preferred.jp', 36 | license='MIT License', 37 | packages=find_packages(), 38 | install_requires=install_requires, 39 | tests_require=test_requires)  # setuptools expects tests_require, not test_requires 40 | -------------------------------------------------------------------------------- /chainerrl/policies/mellowmax_policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from logging import getLogger 10 | 11 | import chainer 12 | 13 | from chainerrl import distribution 14 | from chainerrl.policy import Policy 15 | 16 | 17 | logger = getLogger(__name__) 18 | 19 | 20 | class MellowmaxPolicy(chainer.Chain, Policy): 21 | """Mellowmax policy. 22 | 23 | See: http://arxiv.org/abs/1612.05628 24 | 25 | Args: 26 | model (chainer.Link): 27 | Link that is callable and outputs action values. 28 | omega (float): 29 | Parameter of the mellowmax function. 30 | """ 31 | 32 | def __init__(self, model, omega=1.): 33 | self.omega = omega 34 | super().__init__(model=model) 35 | 36 | def __call__(self, x): 37 | h = self.model(x) 38 | return distribution.MellowmaxDistribution(h, omega=self.omega) 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Preferred Networks, Inc.
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /chainerrl/agents/double_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import chainer 9 | 10 | from chainerrl.agents import dqn 11 | from chainerrl.recurrent import state_kept 12 | 13 | 14 | class DoubleDQN(dqn.DQN): 15 | """Double DQN. 16 | 17 | See: http://arxiv.org/abs/1509.06461. 18 | """ 19 | 20 | def _compute_target_values(self, exp_batch): 21 | 22 | batch_next_state = exp_batch['next_state'] 23 | 24 | with chainer.using_config('train', False), state_kept(self.q_function): 25 | next_qout = self.q_function(batch_next_state) 26 | 27 | target_next_qout = self.target_q_function(batch_next_state) 28 | 29 | next_q_max = target_next_qout.evaluate_actions( 30 | next_qout.greedy_actions) 31 | 32 | batch_rewards = exp_batch['reward'] 33 | batch_terminal = exp_batch['is_state_terminal'] 34 | discount = exp_batch['discount'] 35 | 36 | return batch_rewards + discount * (1.0 - batch_terminal) * next_q_max 37 | -------------------------------------------------------------------------------- /chainerrl/misc/init_like_torch.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | from chainer import links as L 8 | import numpy as np 9 | 10 | 11 | def init_like_torch(link): 12 | # Mimic torch's default parameter initialization 13 | # TODO(muupan): Use chainer's initializers when it is merged 14 | for l in link.links(): 15 | if isinstance(l, L.Linear): 16 | out_channels, in_channels = l.W.shape 17 | stdv = 1 / np.sqrt(in_channels) 18 | l.W.array[:] = np.random.uniform(-stdv, stdv, size=l.W.shape) 19 | if l.b is not None: 20 | l.b.array[:] = np.random.uniform(-stdv, stdv, size=l.b.shape) 21 | elif isinstance(l, L.Convolution2D): 22 | out_channels, in_channels, kh, kw = l.W.shape 23 | stdv = 1 / np.sqrt(in_channels * kh * kw) 24 | l.W.array[:] = 
np.random.uniform(-stdv, stdv, size=l.W.shape) 25 | if l.b is not None: 26 | l.b.array[:] = np.random.uniform(-stdv, stdv, size=l.b.shape) 27 | -------------------------------------------------------------------------------- /tests/explorers_tests/test_additive_ou.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | from chainer import testing 10 | import numpy as np 11 | 12 | from chainerrl.explorers.additive_ou import AdditiveOU 13 | 14 | 15 | @testing.parameterize(*testing.product({ 16 | 'action_size': [1, 3], 17 | 'sigma_type': ['scalar', 'ndarray'], 18 | })) 19 | class TestAdditiveOU(unittest.TestCase): 20 | 21 | def test(self): 22 | 23 | def greedy_action_func(): 24 | return np.asarray([0] * self.action_size, dtype=np.float32) 25 | 26 | if self.sigma_type == 'scalar': 27 | sigma = np.random.rand() 28 | elif self.sigma_type == 'ndarray': 29 | sigma = np.random.rand(self.action_size) 30 | theta = np.random.rand() 31 | 32 | explorer = AdditiveOU(theta=theta, sigma=sigma) 33 | 34 | print('theta:', theta, 'sigma', sigma) 35 | for t in range(100): 36 | a = explorer.select_action(t, greedy_action_func) 37 | print(t, a) 38 | -------------------------------------------------------------------------------- /chainerrl/misc/random.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import numpy as np 10 | 11 | 12 | def sample_n_k(n, k): 13 | """Sample k distinct elements uniformly from range(n)""" 14 | 15 | if not 0 <= k <= n: 16 | raise ValueError("Sample larger than population or is negative") 17 | if k == 0: 18 | return np.empty((0,), dtype=np.int64) 19 | elif 3 * k >= n: 20 | return np.random.choice(n, k, replace=False) 21 | else: 22 | result = np.random.choice(n, 2 * k) 23 | selected = set() 24 | selected_add = selected.add 25 | j = k 26 | for i in range(k): 27 | x = result[i] 28 | while x in selected: 29 | x = result[i] = result[j] 30 | j += 1 31 | if j == 2 * k: 32 | # This is slow, but it rarely happens. 33 | result[k:] = np.random.choice(n, k) 34 | j = k 35 | selected_add(x) 36 | return result[:k] 37 | -------------------------------------------------------------------------------- /chainerrl/explorers/boltzmann.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | import numpy as np 12 | 13 | import chainerrl 14 | 15 | 16 | class Boltzmann(chainerrl.explorer.Explorer): 17 | """Boltzmann exploration. 18 | 19 | Args: 20 | T (float): Temperature of Boltzmann distribution. 
21 | """ 22 | 23 | def __init__(self, T=1.0): 24 | self.T = T 25 | 26 | def select_action(self, t, greedy_action_func, action_value=None): 27 | assert action_value is not None 28 | assert isinstance(action_value, 29 | chainerrl.action_value.DiscreteActionValue) 30 | n_actions = action_value.q_values.shape[1] 31 | with chainer.no_backprop_mode(): 32 | probs = chainer.cuda.to_cpu( 33 | F.softmax(action_value.q_values / self.T).array).ravel() 34 | return np.random.choice(np.arange(n_actions), p=probs) 35 | 36 | def __repr__(self): 37 | return 'Boltzmann(T={})'.format(self.T) 38 | -------------------------------------------------------------------------------- /chainerrl/wrappers/cast_observation.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import gym 10 | import numpy as np 11 | 12 | 13 | class CastObservation(gym.ObservationWrapper): 14 | """Cast observations to a given type. 15 | 16 | Args: 17 | env: Env to wrap. 18 | dtype: Data type object. 19 | 20 | Attributes: 21 | original_observation: Observation before casting. 22 | """ 23 | 24 | def __init__(self, env, dtype): 25 | super().__init__(env) 26 | self.dtype = dtype 27 | 28 | def _observation(self, observation): 29 | self.original_observation = observation 30 | return observation.astype(self.dtype, copy=False) 31 | 32 | 33 | class CastObservationToFloat32(CastObservation): 34 | """Cast observations to float32, which is common in Chainer. 35 | 36 | Args: 37 | env: Env to wrap. 38 | 39 | Attributes: 40 | original_observation: Observation before casting. 41 | """ 42 | 43 | def __init__(self, env): 44 | super().__init__(env, np.float32) 45 | -------------------------------------------------------------------------------- /tests/q_functions_tests/basetest_state_action_q_function.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | import chainer 12 | import numpy as np 13 | 14 | 15 | class _TestSAQFunction(unittest.TestCase): 16 | 17 | def _test_call_given_model(self, model, gpu): 18 | # This method only check if a given model can receive random input 19 | # data and return output data with the correct interface. 
20 | batch_size = 7 21 | obs = np.random.rand(batch_size, self.n_dim_obs).astype(np.float32) 22 | action = np.random.rand( 23 | batch_size, self.n_dim_action).astype(np.float32) 24 | if gpu >= 0: 25 | model.to_gpu(gpu) 26 | obs = chainer.cuda.to_gpu(obs) 27 | action = chainer.cuda.to_gpu(action) 28 | y = model(obs, action) 29 | self.assertTrue(isinstance(y, chainer.Variable)) 30 | self.assertEqual(y.shape, (batch_size, 1)) 31 | self.assertEqual(chainer.cuda.get_array_module(y), 32 | chainer.cuda.get_array_module(obs)) 33 | -------------------------------------------------------------------------------- /tests/agents_tests/test_ddpg.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_ddpg as base 10 | from chainerrl.agents.ddpg import DDPG 11 | 12 | 13 | class TestDDPGOnContinuousPOABC(base._TestDDPGOnContinuousPOABC): 14 | 15 | def make_ddpg_agent(self, env, model, actor_opt, critic_opt, explorer, 16 | rbuf, gpu): 17 | return DDPG(model, actor_opt, critic_opt, rbuf, gpu=gpu, gamma=0.9, 18 | explorer=explorer, replay_start_size=100, 19 | target_update_method='soft', target_update_interval=1, 20 | episodic_update=True, update_interval=1) 21 | 22 | 23 | class TestDDPGOnContinuousABC(base._TestDDPGOnContinuousABC): 24 | 25 | def make_ddpg_agent(self, env, model, actor_opt, critic_opt, explorer, 26 | rbuf, gpu): 27 | return DDPG(model, actor_opt, critic_opt, rbuf, gpu=gpu, gamma=0.9, 28 | explorer=explorer, replay_start_size=100, 29 | target_update_method='soft', target_update_interval=1, 30 | episodic_update=False) 31 | -------------------------------------------------------------------------------- /chainerrl/misc/reward_filter.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | 9 | class NormalizedRewardFilter(object): 10 | 11 | def __init__(self, tau=1e-3, scale=1, eps=1e-1): 12 | self.tau = tau 13 | self.scale = scale 14 | self.average_reward = 0 15 | self.average_reward_squared = 0 16 | self.eps = eps 17 | 18 | def __call__(self, reward): 19 | self.average_reward *= 1 - self.tau 20 | self.average_reward += self.tau * reward 21 | self.average_reward_squared *= 1 - self.tau 22 | self.average_reward_squared += self.tau * reward ** 2 23 | var = self.average_reward_squared - self.average_reward ** 2 24 | stdev = min(var, self.eps) ** 0.5 25 | return self.scale * (reward - self.average_reward) / stdev 26 | 27 | 28 | class AverageRewardFilter(object): 29 | 30 | def __init__(self, tau=1e-3): 31 | self.tau = tau 32 | self.average_reward = 0 33 | 34 | def __call__(self, reward): 35 | self.average_reward *= 1 - self.tau 36 | self.average_reward += self.tau * reward 37 | return reward - self.average_reward 38 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to ChainerRL 2 | 3 | Any kind of contribution to ChainerRL would be 
highly appreciated! 4 | 5 | Contribution examples: 6 | - Giving a thumbs-up to good issues or pull requests :+1: 7 | - Opening issues about questions, bugs, installation problems, feature requests, algorithm requests, etc. 8 | - Sending pull requests 9 | 10 | Before sending a pull request to ChainerRL, please make sure all the tests pass. 11 | 12 | ## Testing 13 | 14 | To test chainerrl modules, install and run `pytest`. Pass `-m "not gpu"` to skip tests that require a GPU. E.g. 15 | ``` 16 | $ pip install pytest 17 | $ pytest -m "not gpu" 18 | ``` 19 | 20 | 21 | To test examples, run `test_examples.sh [gpu device id]`. Passing `-1` runs the examples on CPU only. 22 | 23 | ## Coding style 24 | 25 | We use PEP8. To check your code, use the `autopep8` and `flake8` packages. 26 | ``` 27 | $ pip install autopep8 flake8 28 | $ autopep8 --diff path/to/your/code.py 29 | $ flake8 path/to/your/code.py 30 | ``` 31 | 32 | 33 | To use Python 3 features as much as possible while keeping Python 2 support, add the following lines to the head of each file. 34 | ``` 35 | from __future__ import print_function 36 | from __future__ import unicode_literals 37 | from __future__ import division 38 | from __future__ import absolute_import 39 | from builtins import * # NOQA 40 | from future import standard_library 41 | standard_library.install_aliases() 42 | ``` 43 | -------------------------------------------------------------------------------- /tests/links_tests/test_noisy_chain.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import chainer 4 | 5 | from chainerrl.links import to_factorized_noisy 6 | 7 | 8 | def names_of_links(link): 9 | return set([name for name, _ in link.namedlinks(skipself=True)]) 10 | 11 | 12 | class TestToFactorizedNoisy(unittest.TestCase): 13 | def test_chainlist(self): 14 | ch = chainer.ChainList( 15 | chainer.links.Linear(3, 4), 16 | chainer.links.Linear(5), 17 | chainer.links.PReLU(), 18 | ) 19 | self.assertEqual( 20 | names_of_links(ch), 21 | {'/0', '/1', '/2'}) 22 | 23 | to_factorized_noisy(ch) 24 | self.assertEqual( 25 | names_of_links(ch), 26 | { 27 | '/0', '/0/mu', '/0/sigma', 28 | '/1', '/1/mu', '/1/sigma', '/2'}) 29 | 30 | def test_chain(self): 31 | ch = chainer.Chain() 32 | with ch.init_scope(): 33 | ch.l1 = chainer.links.Linear(3, 4) 34 | ch.l2 = chainer.links.Linear(5) 35 | ch.l3 = chainer.links.PReLU() 36 | self.assertEqual( 37 | names_of_links(ch), 38 | {'/l1', '/l2', '/l3'}) 39 | 40 | to_factorized_noisy(ch) 41 | self.assertEqual( 42 | names_of_links(ch), 43 | { 44 | '/l1', '/l1/mu', '/l1/sigma', 45 | '/l2', '/l2/mu', '/l2/sigma', '/l3'}) 46 | -------------------------------------------------------------------------------- /tests/agents_tests/test_sarsa.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from builtins import * # NOQA 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_dqn_like as base 10 | from chainerrl.agents import SARSA 11 | 12 | # Sarsa does not support batch training 13 | 14 | 15 | class TestSARSAOnDiscreteABC(base._TestDQNOnDiscreteABC): 16 | 17 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 18 | return SARSA( 19 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 20 | replay_start_size=100,
target_update_interval=100) 21 | 22 | 23 | class TestSARSAOnContinuousABC(base._TestDQNOnContinuousABC): 24 | 25 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 26 | return SARSA( 27 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 28 | replay_start_size=100, target_update_interval=100) 29 | 30 | 31 | class TestSARSAOnDiscretePOABC(base._TestDQNOnDiscretePOABC): 32 | 33 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 34 | return SARSA( 35 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 36 | replay_start_size=100, target_update_interval=100, 37 | episodic_update=True) 38 | -------------------------------------------------------------------------------- /tests/wrappers_tests/test_scale_reward.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | 10 | import unittest 11 | 12 | from chainer import testing 13 | import gym 14 | 15 | import chainerrl 16 | 17 | 18 | @testing.parameterize(*testing.product({ 19 | 'env_id': ['CartPole-v1', 'MountainCar-v0'], 20 | 'scale': [1.0, 0.1] 21 | })) 22 | class TestScaleReward(unittest.TestCase): 23 | 24 | def test_scale_reward(self): 25 | env = chainerrl.wrappers.ScaleReward( 26 | gym.make(self.env_id), scale=self.scale) 27 | self.assertIsNone(env.original_reward) 28 | self.assertAlmostEqual(env.scale, self.scale) 29 | 30 | _ = env.reset() 31 | _, r, _, _ = env.step(env.action_space.sample()) 32 | 33 | if self.env_id == 'CartPole-v1': 34 | # Original reward must be 1 35 | self.assertAlmostEqual(env.original_reward, 1) 36 | self.assertAlmostEqual(r, self.scale) 37 | elif self.env_id == 'MountainCar-v0': 38 | # Original reward must be -1 39 | self.assertAlmostEqual(env.original_reward, -1) 40 | self.assertAlmostEqual(r, -self.scale) 41 | else: 42 | assert False 43 | -------------------------------------------------------------------------------- /tests/agents_tests/test_pgt.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_pgt as base 10 | from chainerrl.agents.pgt import PGT 11 | 12 | 13 | # Currently PGT does not support recurrent models 14 | # class TestPGTOnContinuousPOABC(base._TestPGTOnContinuousPOABC): 15 | # 16 | # def make_pgt_agent(self, env, model, actor_opt, critic_opt, explorer, 17 | # rbuf, gpu): 18 | # return PGT(model, actor_opt, critic_opt, rbuf, gpu=gpu, gamma=0.9, 19 | # explorer=explorer, replay_start_size=100, 20 | # target_update_method='soft', target_update_interval=1, 21 | # episodic_update=True, update_interval=1, 22 | # act_deterministically=True) 23 | 24 | 25 | class TestPGTOnContinuousABC(base._TestPGTOnContinuousABC): 26 | 27 | def make_pgt_agent(self, env, model, actor_opt, critic_opt, explorer, 28 | rbuf, gpu): 29 | return PGT(model, actor_opt, critic_opt, rbuf, gpu=gpu, gamma=0.9, 30 | explorer=explorer, replay_start_size=100, 31 | target_update_method='soft', target_update_interval=1, 32 | 
act_deterministically=True) 33 | -------------------------------------------------------------------------------- /chainerrl/functions/sum_arrays.py: -------------------------------------------------------------------------------- 1 | from chainer import cuda 2 | from chainer import function 3 | from chainer import utils 4 | from chainer.utils import type_check 5 | 6 | 7 | class SumArrays(function.Function): 8 | """Element-wise sum of input arrays.""" 9 | 10 | def check_type_forward(self, in_types): 11 | type_check.expect( 12 | in_types[0].dtype.kind == 'f', 13 | ) 14 | 15 | def forward_cpu(self, inputs): 16 | y = sum(inputs) 17 | return utils.force_array(y), 18 | 19 | def backward(self, inputs, grads): 20 | return [grads[0]] * len(inputs) 21 | 22 | def forward_gpu(self, inputs): 23 | n = len(inputs) 24 | ptrs = cuda.cupy.asarray([x.data.ptr for x in inputs], 25 | dtype=cuda.cupy.int64) 26 | y = cuda.elementwise( 27 | 'T x0, int64 xs, int32 n_xs', 28 | 'T y', 29 | 'float** xs_ = (float**) xs;' 30 | 'y = 0;' 31 | 'for (size_t j = 0; j < n_xs; ++j) {' 32 | ' y += xs_[j][i];' 33 | '}', 34 | 'sum_arrays')(inputs[0], ptrs.data.ptr, n) 35 | return y, 36 | 37 | 38 | def sum_arrays(xs): 39 | """Element-wise sum of input arrays. 40 | 41 | Args: 42 | xs (tuple of ~chainer.Variable or ndarray): Input arrays to be summed. 43 | 44 | Returns: 45 | ~chainer.Variable: Output variable. 46 | """ 47 | return SumArrays()(*xs) 48 | -------------------------------------------------------------------------------- /chainerrl/agents/sarsa.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainerrl.agents import dqn 10 | 11 | 12 | class SARSA(dqn.DQN): 13 | """SARSA. 14 | 15 | Unlike DQN, this agent uses actions that were actually taken to 16 | compute target Q values, and is thus an on-policy algorithm.
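To make the docstring above concrete, here is a hedged NumPy sketch (not library code; all numbers are invented) contrasting the SARSA target with the DQN target on a toy batch of two transitions:
```
import numpy as np

gamma = 0.9
rewards = np.array([1.0, 0.0], dtype=np.float32)
terminal = np.array([0.0, 1.0], dtype=np.float32)
# Q-values of the next states for 3 discrete actions.
next_q = np.array([[0.1, 0.5, 0.2],
                   [0.3, 0.0, 0.4]], dtype=np.float32)
next_actions = np.array([0, 2])  # actions actually taken at the next step

# SARSA evaluates the action actually taken (on-policy) ...
sarsa_target = rewards + gamma * (1.0 - terminal) * next_q[np.arange(2), next_actions]
# ... while DQN evaluates the greedy action (off-policy).
dqn_target = rewards + gamma * (1.0 - terminal) * next_q.max(axis=1)
print(sarsa_target)  # [1.09 0.  ]
print(dqn_target)    # [1.45 0.  ]
```
In the second transition the terminal flag zeroes the bootstrap term, so both targets reduce to the reward.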
17 | """ 18 | 19 | def _compute_target_values(self, exp_batch): 20 | 21 | batch_next_state = exp_batch['next_state'] 22 | batch_next_action = exp_batch['next_action'] 23 | 24 | next_target_action_value = self.target_q_function( 25 | batch_next_state) 26 | next_q = next_target_action_value.evaluate_actions( 27 | batch_next_action) 28 | 29 | batch_rewards = exp_batch['reward'] 30 | batch_terminal = exp_batch['is_state_terminal'] 31 | discount = exp_batch['discount'] 32 | 33 | return batch_rewards + discount * (1.0 - batch_terminal) * next_q 34 | 35 | def batch_act_and_train(self, batch_obs): 36 | raise NotImplementedError('SARSA does not support batch training') 37 | 38 | def batch_observe_and_train(self, batch_obs, batch_reward, 39 | batch_done, batch_reset): 40 | raise NotImplementedError('SARSA does not support batch training') 41 | -------------------------------------------------------------------------------- /chainerrl/env.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from future.utils import with_metaclass 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | 13 | class Env(with_metaclass(ABCMeta, object)): 14 | """RL learning environment. 15 | 16 | This serves a minimal interface for RL agents. 17 | """ 18 | 19 | @abstractmethod 20 | def step(self, action): 21 | raise NotImplementedError() 22 | 23 | @abstractmethod 24 | def reset(self): 25 | raise NotImplementedError() 26 | 27 | @abstractmethod 28 | def close(self): 29 | raise NotImplementedError() 30 | 31 | 32 | class VectorEnv(with_metaclass(ABCMeta, object)): 33 | """Parallel RL learning environments.""" 34 | 35 | @abstractmethod 36 | def step(self, action): 37 | raise NotImplementedError() 38 | 39 | @abstractmethod 40 | def reset(self, mask): 41 | """Reset envs. 42 | 43 | Args: 44 | mask (Sequence of bool): Mask array that specifies which env to 45 | skip. If omitted, all the envs are reset. 46 | """ 47 | raise NotImplementedError() 48 | 49 | @abstractmethod 50 | def seed(self, seeds): 51 | raise NotImplementedError() 52 | 53 | @abstractmethod 54 | def close(self): 55 | raise NotImplementedError() 56 | -------------------------------------------------------------------------------- /chainerrl/misc/conjugate_gradient.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | 11 | 12 | def conjugate_gradient(A_product_func, b, tol=1e-10, max_iter=10): 13 | """Conjugate Gradient (CG) method. 14 | 15 | This function solves Ax=b for the vector x, where A is a real 16 | positive-definite matrix and b is a real vector. 17 | 18 | Args: 19 | A_product_func (callable): Callable that returns the product of the 20 | matrix A and a given vector. 21 | b (numpy.ndarray or cupy.ndarray): The vector b. 22 | tol (float): Tolerance parameter for early stopping. 23 | max_iter (int): Maximum number of iterations. 24 | 25 | Returns: 26 | numpy.ndarray or cupy.ndarray: The solution. 
27 | The array module will be the same as the argument b's. 28 | """ 29 | xp = chainer.cuda.get_array_module(b) 30 | x = xp.zeros_like(b) 31 | r0 = b - A_product_func(x) 32 | p = r0 33 | for i in range(max_iter): 34 | a = xp.dot(r0, r0) / xp.dot(A_product_func(p), p) 35 | x = x + p * a 36 | r1 = r0 - A_product_func(p) * a 37 | if xp.linalg.norm(r1) < tol: 38 | return x 39 | b = xp.dot(r1, r1) / xp.dot(r0, r0) 40 | p = r1 + b * p 41 | r0 = r1 42 | return x 43 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | cache: pip 3 | python: 4 | - "2.7" 5 | - "3.5.1" 6 | env: 7 | - CHAINER_VERSION=3 8 | - CHAINER_VERSION=stable 9 | # command to install dependencies 10 | install: 11 | - pip install --upgrade pip setuptools wheel 12 | - | 13 | if [[ $CHAINER_VERSION == 3 ]]; then 14 | pip install "chainer==3.1.0" 15 | else 16 | pip install chainer 17 | fi 18 | - pip install pytest-cov 19 | - pip install -r requirements.txt --only-binary=numpy,scipy 20 | - pip install jupyter 21 | # gym 0.11.0 causes an error with Python 2 22 | - pip install "gym!=0.11.0" 23 | # atari_py==0.1.4 causes an error 24 | - pip install atari_py==0.1.1 25 | - pip install autopep8 26 | - pip install flake8 27 | - pip install coveralls 28 | - pip install opencv-python 29 | - pip install pybullet 30 | - python setup.py develop 31 | - python -c "import numpy; numpy.show_config()" 32 | before_script: 33 | - "export DISPLAY=:99.0" 34 | - sh -e /etc/init.d/xvfb start 35 | - sleep 3 36 | # command to run tests 37 | script: 38 | - flake8 chainerrl 39 | - flake8 tests 40 | - flake8 examples 41 | - autopep8 -r chainerrl tests examples --diff | tee check_autopep8 42 | - test ! -s check_autopep8 43 | - pytest -m "not gpu and not slow" -x tests --cov=chainerrl 44 | - ./test_examples.sh -1 45 | - if [[ $TRAVIS_PYTHON_VERSION == 3.5.1 && $CHAINER_VERSION == stable ]]; then jupyter nbconvert --to notebook --execute examples/quickstart/quickstart.ipynb --ExecutePreprocessor.timeout=600; fi 46 | after_success: 47 | - coveralls 48 | -------------------------------------------------------------------------------- /chainerrl/optimizers/nonbias_weight_decay.py: -------------------------------------------------------------------------------- 1 | # This caused an error in py2 because cupy expect non-unicode str 2 | # from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import division 5 | from __future__ import absolute_import 6 | from builtins import * # NOQA 7 | from future import standard_library 8 | standard_library.install_aliases() # NOQA 9 | from chainer import cuda 10 | 11 | 12 | class NonbiasWeightDecay(object): 13 | 14 | """Weight decay only for non-bias parameters. 15 | 16 | This hook can be used just like chainer.optimizer_hooks.WeightDecay except 17 | that this hook does not apply weight decay to bias parameters. 18 | 19 | This hook assumes that all the bias parameters have the name of "b". Any 20 | parameter whose name is "b" is considered as a bias and excluded from 21 | weight decay. 
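A short usage sketch for the hook documented above, modeled on the optimizer tests later in this repo; the model shape and decay rate are illustrative:
```
import chainer
import chainer.links as L

from chainerrl.optimizers import NonbiasWeightDecay

model = L.Linear(3, 2)
optimizer = chainer.optimizers.SGD(lr=0.1)
optimizer.setup(model)
# W is decayed on every update; the bias parameter b is left untouched.
optimizer.add_hook(NonbiasWeightDecay(rate=1e-4))
```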
22 | """ 23 | name = 'NonbiasWeightDecay' 24 | call_for_each_param = True 25 | timing = 'pre' 26 | 27 | def __init__(self, rate): 28 | self.rate = rate 29 | 30 | def __call__(self, rule, param): 31 | if param.name == 'b': 32 | return 33 | p, g = param.array, param.grad 34 | if p is None or g is None: 35 | return 36 | with cuda.get_device_from_array(p) as dev: 37 | if int(dev) == -1: 38 | g += self.rate * p 39 | else: 40 | kernel = cuda.elementwise( 41 | 'T p, T decay', 'T g', 'g += decay * p', 'weight_decay') 42 | kernel(p, self.rate, g) 43 | -------------------------------------------------------------------------------- /chainerrl/misc/random_seed.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import os 10 | import random 11 | 12 | import chainer 13 | import numpy as np 14 | 15 | 16 | def set_random_seed(seed, gpus=()): 17 | """Set a given random seed to ChainerRL's random sources. 18 | 19 | This function sets a given random seed to random sources that ChainerRL 20 | depends on so that ChainerRL can be deterministic. It is not responsible 21 | for setting a random seed to environments ChainerRL is applied to. 22 | 23 | Note that there's no guaranteed way to make all the computations done by 24 | Chainer deterministic. See https://github.com/chainer/chainer/issues/4134. 25 | 26 | Args: 27 | seed (int): Random seed [0, 2 ** 32). 28 | gpus (tuple of ints): GPU device IDs to use. Negative values are 29 | ignored. 30 | """ 31 | # ChainerRL depends on random 32 | random.seed(seed) 33 | # ChainerRL depends on numpy.random 34 | np.random.seed(seed) 35 | # ChainerRL depends on cupy.random for GPU computation 36 | for gpu in gpus: 37 | if gpu >= 0: 38 | with chainer.cuda.get_device_from_id(gpu): 39 | chainer.cuda.cupy.random.seed(seed) 40 | # chainer.functions.n_step_rnn directly depends on CHAINER_SEED 41 | os.environ['CHAINER_SEED'] = str(seed) 42 | -------------------------------------------------------------------------------- /tests/agents_tests/test_al.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from builtins import * # NOQA 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_dqn_like as base 10 | from basetest_training import _TestBatchTrainingMixin 11 | from chainerrl.agents.al import AL 12 | 13 | 14 | class TestALOnDiscreteABC( 15 | _TestBatchTrainingMixin, 16 | base._TestDQNOnDiscreteABC): 17 | 18 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 19 | return AL( 20 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 21 | replay_start_size=100, target_update_interval=100) 22 | 23 | 24 | class TestALOnContinuousABC( 25 | _TestBatchTrainingMixin, 26 | base._TestDQNOnContinuousABC): 27 | 28 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 29 | return AL( 30 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 31 | replay_start_size=100, target_update_interval=100) 32 | 33 | 34 | # Batch training with recurrent models is currently not supported 35 | class 
TestALOnDiscretePOABC(base._TestDQNOnDiscretePOABC): 36 | 37 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 38 | return AL( 39 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 40 | replay_start_size=100, target_update_interval=100, 41 | episodic_update=True) 42 | -------------------------------------------------------------------------------- /tests/agents_tests/test_pal.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_dqn_like as base 10 | from basetest_training import _TestBatchTrainingMixin 11 | from chainerrl.agents.pal import PAL 12 | 13 | 14 | class TestPALOnDiscreteABC( 15 | _TestBatchTrainingMixin, 16 | base._TestDQNOnDiscreteABC): 17 | 18 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 19 | return PAL( 20 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 21 | replay_start_size=100, target_update_interval=100) 22 | 23 | 24 | class TestPALOnContinuousABC( 25 | _TestBatchTrainingMixin, 26 | base._TestDQNOnContinuousABC): 27 | 28 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 29 | return PAL( 30 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 31 | replay_start_size=100, target_update_interval=100) 32 | 33 | 34 | # Batch training with recurrent models is currently not supported 35 | class TestPALOnDiscretePOABC(base._TestDQNOnDiscretePOABC): 36 | 37 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 38 | return PAL( 39 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 40 | replay_start_size=100, target_update_interval=100, 41 | episodic_update=True) 42 | -------------------------------------------------------------------------------- /tests/agents_tests/test_double_pal.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from builtins import * # NOQA 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainerrl.agents.double_pal import DoublePAL 10 | 11 | import basetest_dqn_like 12 | from basetest_training import _TestBatchTrainingMixin 13 | 14 | 15 | class TestDoublePALOnDiscreteABC( 16 | _TestBatchTrainingMixin, 17 | basetest_dqn_like._TestDQNOnDiscreteABC): 18 | 19 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 20 | return DoublePAL( 21 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 22 | replay_start_size=100, target_update_interval=100) 23 | 24 | 25 | class TestDoublePALOnContinuousABC( 26 | _TestBatchTrainingMixin, 27 | basetest_dqn_like._TestDQNOnContinuousABC): 28 | 29 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 30 | return DoublePAL( 31 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 32 | replay_start_size=100, target_update_interval=100) 33 | 34 | 35 | class TestDoublePALOnDiscretePOABC(basetest_dqn_like._TestDQNOnDiscretePOABC): 36 | 37 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 38 | return DoublePAL( 39 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 40 | 
replay_start_size=100, target_update_interval=100, 41 | episodic_update=True) 42 | -------------------------------------------------------------------------------- /chainerrl/v_functions/v_functions.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | 12 | from chainerrl.links.mlp import MLP 13 | from chainerrl.recurrent import RecurrentChainMixin 14 | from chainerrl.v_function import VFunction 15 | 16 | 17 | class SingleModelVFunction( 18 | chainer.Chain, VFunction, RecurrentChainMixin): 19 | """V-function 20 | 21 | Args: 22 | model (chainer.Link): 23 | Callable Link that takes states as input and outputs state values. 24 | """ 25 | 26 | def __init__(self, model): 27 | super().__init__(model=model) 28 | 29 | def __call__(self, x): 30 | h = self.model(x) 31 | return h 32 | 33 | 34 | class FCVFunction(SingleModelVFunction): 35 | 36 | def __init__(self, n_input_channels, n_hidden_layers=0, 37 | n_hidden_channels=None, nonlinearity=F.relu, 38 | last_wscale=1): 39 | self.n_input_channels = n_input_channels 40 | self.n_hidden_layers = n_hidden_layers 41 | self.n_hidden_channels = n_hidden_channels 42 | 43 | super().__init__( 44 | model=MLP(self.n_input_channels, 1, 45 | [self.n_hidden_channels] * self.n_hidden_layers, 46 | nonlinearity=nonlinearity, 47 | last_wscale=last_wscale), 48 | ) 49 | -------------------------------------------------------------------------------- /tests/wrappers_tests/test_continuing_time_limit.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import mock 10 | import unittest 11 | 12 | from chainer import testing 13 | 14 | import chainerrl 15 | 16 | 17 | @testing.parameterize(*testing.product({ 18 | 'max_episode_steps': [1, 2, 3], 19 | })) 20 | class TestContinuingTimeLimit(unittest.TestCase): 21 | 22 | def test(self): 23 | env = mock.Mock() 24 | env.reset.side_effect = ['state'] * 2 25 | # Since info dicts are modified by the wrapper, each step call needs to 26 | # return a new info dict.
27 | env.step.side_effect = [('state', 0, False, {}) for _ in range(6)] 28 | env = chainerrl.wrappers.ContinuingTimeLimit( 29 | env, max_episode_steps=self.max_episode_steps) 30 | 31 | env.reset() 32 | for t in range(2): 33 | _, _, done, info = env.step(0) 34 | if t + 1 >= self.max_episode_steps: 35 | self.assertTrue(info['needs_reset']) 36 | else: 37 | self.assertFalse(info.get('needs_reset', False)) 38 | 39 | env.reset() 40 | for t in range(4): 41 | _, _, done, info = env.step(0) 42 | if t + 1 >= self.max_episode_steps: 43 | self.assertTrue(info['needs_reset']) 44 | else: 45 | self.assertFalse(info.get('needs_reset', False)) 46 | -------------------------------------------------------------------------------- /tests/misc_tests/test_conjugate_gradient.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import chainer 10 | from chainer import testing 11 | from chainer.testing import condition 12 | import numpy as np 13 | 14 | import chainerrl 15 | 16 | 17 | @testing.parameterize( 18 | *testing.product({ 19 | 'n': [1, 5], 20 | 'dtype': [np.float64, np.float32], 21 | }) 22 | ) 23 | class TestConjugateGradient(unittest.TestCase): 24 | 25 | def _test(self, xp): 26 | # A must be symmetric and positive-definite 27 | random_mat = xp.random.normal(size=(self.n, self.n)).astype(self.dtype) 28 | A = random_mat.dot(random_mat.T) 29 | x_ans = xp.random.normal(size=self.n).astype(self.dtype) 30 | b = A.dot(x_ans) 31 | 32 | def A_product_func(vec): 33 | self.assertEqual(xp, chainer.cuda.get_array_module(vec)) 34 | self.assertEqual(vec.shape, b.shape) 35 | return A.dot(vec) 36 | 37 | x = chainerrl.misc.conjugate_gradient(A_product_func, b) 38 | self.assertEqual(x.dtype, self.dtype) 39 | self.assertTrue(chainer.cuda.get_array_module(x), xp) 40 | xp.testing.assert_allclose(x, x_ans, rtol=1e-3) 41 | 42 | @condition.retry(3) 43 | def test_cpu(self): 44 | self._test(np) 45 | 46 | @testing.attr.gpu 47 | @condition.retry(3) 48 | def test_gpu(self): 49 | self._test(chainer.cuda.cupy) 50 | -------------------------------------------------------------------------------- /tests/agents_tests/test_double_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import basetest_dqn_like 10 | from basetest_training import _TestBatchTrainingMixin 11 | from chainerrl.agents.double_dqn import DoubleDQN 12 | 13 | 14 | class TestDoubleDQNOnDiscreteABC( 15 | _TestBatchTrainingMixin, 16 | basetest_dqn_like._TestDQNOnDiscreteABC): 17 | 18 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 19 | return DoubleDQN( 20 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 21 | replay_start_size=100, target_update_interval=100) 22 | 23 | 24 | class TestDoubleDQNOnContinuousABC( 25 | _TestBatchTrainingMixin, 26 | basetest_dqn_like._TestDQNOnContinuousABC): 27 | 28 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 29 | return DoubleDQN( 30 | q_func, opt, rbuf, gpu=gpu, 
gamma=0.9, explorer=explorer, 31 | replay_start_size=100, target_update_interval=100) 32 | 33 | 34 | # Batch training with recurrent models is currently not supported 35 | class TestDoubleDQNOnDiscretePOABC(basetest_dqn_like._TestDQNOnDiscretePOABC): 36 | 37 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 38 | return DoubleDQN( 39 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 40 | replay_start_size=100, target_update_interval=100, 41 | episodic_update=True) 42 | -------------------------------------------------------------------------------- /tests/misc_tests/test_batch_states.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import chainer 10 | from chainer import testing 11 | import numpy as np 12 | 13 | import chainerrl 14 | 15 | 16 | class TestBatchStates(unittest.TestCase): 17 | 18 | def _test(self, xp): 19 | 20 | # state: ((2,2)-shaped array, integer, (1,)-shaped array) 21 | states = [ 22 | (np.arange(4).reshape((2, 2)), 0, np.zeros(1)), 23 | (np.arange(4).reshape((2, 2)) + 1, 1, np.zeros(1) + 1), 24 | ] 25 | 26 | def phi(state): 27 | return state[0] * 2, state[1], state[2] * 3 28 | 29 | batch = chainerrl.misc.batch_states(states, xp=xp, phi=phi) 30 | self.assertIsInstance(batch, tuple) 31 | batch_a, batch_b, batch_c = batch 32 | xp.testing.assert_allclose( 33 | batch_a, 34 | xp.asarray([ 35 | [[0, 2], 36 | [4, 6]], 37 | [[2, 4], 38 | [6, 8]], 39 | ]) 40 | ) 41 | xp.testing.assert_allclose( 42 | batch_b, 43 | xp.asarray([0, 1]) 44 | ) 45 | xp.testing.assert_allclose( 46 | batch_c, 47 | xp.asarray([ 48 | [0], 49 | [3], 50 | ]) 51 | ) 52 | 53 | def test_cpu(self): 54 | self._test(np) 55 | 56 | @testing.attr.gpu 57 | def test_gpu(self): 58 | self._test(chainer.cuda.cupy) 59 | -------------------------------------------------------------------------------- /tests/agents_tests/basetest_agents.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import os 8 | import tempfile 9 | import unittest 10 | 11 | from chainer import testing 12 | 13 | from chainerrl.envs.abc import ABC 14 | from chainerrl.experiments.train_agent import train_agent 15 | 16 | 17 | class _TestAgentInterface(unittest.TestCase): 18 | 19 | def setUp(self): 20 | self.env = ABC(discrete=self.discrete, 21 | partially_observable=self.partially_observable, 22 | episodic=self.episodic) 23 | 24 | def create_agent(self, env): 25 | raise NotImplementedError() 26 | 27 | def test_save_load(self): 28 | a = self.create_agent(self.env) 29 | dirname = tempfile.mkdtemp() 30 | a.save(dirname) 31 | self.assertTrue(os.path.exists(dirname)) 32 | b = self.create_agent(self.env) 33 | b.load(dirname) 34 | 35 | def test_run_episode(self): 36 | agent = self.create_agent(self.env) 37 | done = False 38 | obs = self.env.reset() 39 | t = 0 40 | while t < 10 and not done: 41 | a = agent.act(obs) 42 | obs, r, done, info = self.env.step(a) 43 | t += 1 44 | 45 | @testing.attr.slow 46 | def test_train(self): 47 | agent = 
self.create_agent(self.env) 48 | train_agent( 49 | agent=agent, 50 | env=self.env, 51 | steps=2000, 52 | outdir=tempfile.mkdtemp(), 53 | max_episode_len=10) 54 | -------------------------------------------------------------------------------- /tests/agents_tests/test_residual_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import basetest_dqn_like as base 9 | from basetest_training import _TestBatchTrainingMixin 10 | from chainerrl.agents.residual_dqn import ResidualDQN 11 | 12 | 13 | class TestResidualDQNOnDiscreteABC( 14 | _TestBatchTrainingMixin, 15 | base._TestDQNOnDiscreteABC): 16 | 17 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 18 | return ResidualDQN( 19 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 20 | replay_start_size=100, target_update_interval=100, 21 | grad_scale=1e-1) 22 | 23 | 24 | class TestResidualDQNOnContinuousABC( 25 | _TestBatchTrainingMixin, 26 | base._TestDQNOnContinuousABC): 27 | 28 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 29 | return ResidualDQN( 30 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 31 | replay_start_size=100, target_update_interval=100, 32 | grad_scale=1e-1) 33 | 34 | 35 | # Batch training with recurrent models is currently not supported 36 | class TestResidualDQNOnDiscretePOABC(base._TestDQNOnDiscretePOABC): 37 | 38 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 39 | return ResidualDQN( 40 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 41 | replay_start_size=100, target_update_interval=100, 42 | episodic_update=True, 43 | grad_scale=1e-1) 44 | -------------------------------------------------------------------------------- /tools/plot_scores.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() 7 | import argparse 8 | import os 9 | 10 | import matplotlib 11 | matplotlib.use('Agg') # Needed to run without X-server 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--title', type=str, default='') 19 | parser.add_argument('--file', action='append', dest='files', 20 | default=[], type=str, 21 | help='specify paths of scores.txt') 22 | parser.add_argument('--label', action='append', dest='labels', 23 | default=[], type=str, 24 | help='specify labels for scores.txt files') 25 | args = parser.parse_args() 26 | 27 | assert len(args.files) > 0 28 | assert len(args.labels) == len(args.files) 29 | 30 | for fpath, label in zip(args.files, args.labels): 31 | if os.path.isdir(fpath): 32 | fpath = os.path.join(fpath, 'scores.txt') 33 | assert os.path.exists(fpath) 34 | scores = pd.read_csv(fpath, delimiter='\t') 35 | plt.plot(scores['steps'], scores['mean'], label=label) 36 | 37 | plt.xlabel('steps') 38 | plt.ylabel('score') 39 | plt.legend(loc='best') 40 | if args.title: 41 | plt.title(args.title) 42 | 43 | fig_fname = args.files[0] + args.title + '.png' 44 | plt.savefig(fig_fname) 45 | 
print('Saved a figure as {}'.format(fig_fname)) 46 | 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /chainerrl/links/mlp.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | from chainer import links as L 12 | 13 | from chainerrl.initializers import LeCunNormal 14 | 15 | 16 | class MLP(chainer.Chain): 17 | """Multi-Layer Perceptron""" 18 | 19 | def __init__(self, in_size, out_size, hidden_sizes, nonlinearity=F.relu, 20 | last_wscale=1): 21 | self.in_size = in_size 22 | self.out_size = out_size 23 | self.hidden_sizes = hidden_sizes 24 | self.nonlinearity = nonlinearity 25 | 26 | super().__init__() 27 | with self.init_scope(): 28 | if hidden_sizes: 29 | hidden_layers = [] 30 | hidden_layers.append(L.Linear(in_size, hidden_sizes[0])) 31 | for hin, hout in zip(hidden_sizes, hidden_sizes[1:]): 32 | hidden_layers.append(L.Linear(hin, hout)) 33 | self.hidden_layers = chainer.ChainList(*hidden_layers) 34 | self.output = L.Linear(hidden_sizes[-1], out_size, 35 | initialW=LeCunNormal(last_wscale)) 36 | else: 37 | self.output = L.Linear(in_size, out_size, 38 | initialW=LeCunNormal(last_wscale)) 39 | 40 | def __call__(self, x): 41 | h = x 42 | if self.hidden_sizes: 43 | for l in self.hidden_layers: 44 | h = self.nonlinearity(l(h)) 45 | return self.output(h) 46 | -------------------------------------------------------------------------------- /chainerrl/envs/serial_vector_env.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import numpy as np 10 | 11 | import chainerrl 12 | 13 | 14 | class SerialVectorEnv(chainerrl.env.VectorEnv): 15 | """VectorEnv where each env is run sequentially. 16 | 17 | The purpose of this VectorEnv is to help with debugging. For speed, you 18 | should use MultiprocessVectorEnv if possible. 19 | 20 | Args: 21 | envs (list of gym.Env): The environments to run serially.
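A hedged sketch of calling the MLP link defined above; the sizes are arbitrary and only meant to show the expected shapes:
```
import numpy as np

from chainerrl.links.mlp import MLP

mlp = MLP(in_size=4, out_size=2, hidden_sizes=(16, 16))
x = np.zeros((5, 4), dtype=np.float32)  # a batch of 5 four-dimensional inputs
y = mlp(x)  # a chainer.Variable
print(y.shape)  # (5, 2)
```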
22 | """ 23 | 24 | def __init__(self, envs): 25 | self.envs = envs 26 | self.last_obs = [None] * self.num_envs 27 | self.action_space = envs[0].action_space 28 | self.observation_space = envs[0].observation_space 29 | self.spec = envs[0].observation_space 30 | 31 | def step(self, actions): 32 | results = [env.step(a) for env, a in zip(self.envs, actions)] 33 | self.last_obs, rews, dones, infos = zip(*results) 34 | return self.last_obs, rews, dones, infos 35 | 36 | def reset(self, mask=None): 37 | if mask is None: 38 | mask = np.zeros(self.num_envs) 39 | obs = [env.reset() if not m else o 40 | for m, env, o in zip(mask, self.envs, self.last_obs)] 41 | self.last_obs = obs 42 | return obs 43 | 44 | def seed(self, seeds): 45 | for env, seed in zip(self.envs, seeds): 46 | env.seed(seed) 47 | 48 | def close(self): 49 | for env in self.envs: 50 | env.close() 51 | 52 | @property 53 | def num_envs(self): 54 | return len(self.envs) 55 | -------------------------------------------------------------------------------- /chainerrl/wrappers/continuing_time_limit.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import gym 10 | 11 | 12 | class ContinuingTimeLimit(gym.Wrapper): 13 | """TimeLimit wrapper for continuing environments. 14 | 15 | This is similar gym.wrappers.TimeLimit, which sets a time limit for 16 | each episode, except that done=False is returned and that 17 | info['needs_reset'] is set to True when past the limit. 18 | 19 | Code that calls env.step is responsible for checking the info dict, the 20 | fourth returned value, and resetting the env if it has the 'needs_reset' 21 | key and its value is True. 22 | 23 | Args: 24 | env (gym.Env): Env to wrap. 25 | max_episode_steps (int): Maximum number of timesteps during an episode, 26 | after which the env needs a reset. 27 | """ 28 | 29 | def __init__(self, env, max_episode_steps): 30 | super(ContinuingTimeLimit, self).__init__(env) 31 | self._max_episode_steps = max_episode_steps 32 | 33 | self._elapsed_steps = None 34 | 35 | def step(self, action): 36 | assert self._elapsed_steps is not None,\ 37 | "Cannot call env.step() before calling reset()" 38 | observation, reward, done, info = self.env.step(action) 39 | self._elapsed_steps += 1 40 | 41 | if self._max_episode_steps <= self._elapsed_steps: 42 | info['needs_reset'] = True 43 | 44 | return observation, reward, done, info 45 | 46 | def reset(self): 47 | self._elapsed_steps = 0 48 | return self.env.reset() 49 | -------------------------------------------------------------------------------- /chainerrl/wrappers/randomize_action.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import gym 10 | import numpy as np 11 | 12 | 13 | class RandomizeAction(gym.ActionWrapper): 14 | """Apply a random action instead of the one sent by the agent. 15 | 16 | This wrapper can be used to make a stochastic env. 
The common use is 17 | for evaluation in Atari environments, where actions are replaced with 18 | random ones with a low probability. 19 | 20 | Only gym.spaces.Discrete is supported as an action space. 21 | 22 | For exploration during training, use explorers like 23 | chainerrl.explorers.ConstantEpsilonGreedy instead of this wrapper. 24 | 25 | Args: 26 | env (gym.Env): Env to wrap. 27 | random_fraction (float): Fraction of actions that will be replaced 28 | with a random action. It must be in [0, 1]. 29 | """ 30 | 31 | def __init__(self, env, random_fraction): 32 | super().__init__(env) 33 | assert 0 <= random_fraction <= 1 34 | assert isinstance(env.action_space, gym.spaces.Discrete),\ 35 | 'RandomizeAction supports only gym.spaces.Discrete as an action space' # NOQA 36 | self._random_fraction = random_fraction 37 | self._np_random = np.random.RandomState() 38 | 39 | def _action(self, action): 40 | if self._np_random.rand() < self._random_fraction: 41 | return self._np_random.randint(self.env.action_space.n) 42 | else: 43 | return action 44 | 45 | def seed(self, seed): 46 | super().seed(seed) 47 | self._np_random.seed(seed) 48 | -------------------------------------------------------------------------------- /tests/optimizer_tests/test_nonbias_weight_decay.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | import chainer 12 | import chainer.links as L 13 | from chainer import testing 14 | import numpy as np 15 | 16 | import chainerrl 17 | 18 | 19 | @testing.parameterize(*testing.product( 20 | { 21 | 'lr': [1.0, 0.1], 22 | 'weight_decay_rate': [0.1, 0.05] 23 | } 24 | )) 25 | class TestNonbiasWeightDecay(unittest.TestCase): 26 | 27 | def _test(self, gpu): 28 | 29 | model = chainer.Chain( 30 | a=L.Linear(1, 2, initialW=3, initial_bias=3), 31 | b=chainer.Chain(c=L.Linear(2, 3, initialW=4, initial_bias=4)), 32 | ) 33 | if gpu >= 0: 34 | model.to_gpu(gpu) 35 | xp = model.xp 36 | else: 37 | xp = np 38 | optimizer = chainer.optimizers.SGD(self.lr) 39 | optimizer.setup(model) 40 | optimizer.add_hook( 41 | chainerrl.optimizers.NonbiasWeightDecay( 42 | rate=self.weight_decay_rate)) 43 | optimizer.update(lambda: chainer.Variable(xp.asarray(0.0))) 44 | decay_factor = 1 - self.lr * self.weight_decay_rate 45 | xp.testing.assert_allclose(model.a.W.array, 3 * decay_factor) 46 | xp.testing.assert_allclose(model.a.b.array, 3) 47 | xp.testing.assert_allclose(model.b.c.W.array, 4 * decay_factor) 48 | xp.testing.assert_allclose(model.b.c.b.array, 4) 49 | 50 | def test_cpu(self): 51 | self._test(gpu=-1) 52 | 53 | @testing.attr.gpu 54 | def test_gpu(self): 55 | self._test(gpu=0) 56 | -------------------------------------------------------------------------------- /chainerrl/links/noisy_chain.py: -------------------------------------------------------------------------------- 1 | """Noisy Networks 2 | 3 | See http://arxiv.org/abs/1706.10295 4 | """ 5 | 6 | import chainer 7 | from chainer.links import Linear 8 | 9 | from chainerrl.links.noisy_linear import FactorizedNoisyLinear 10 | from chainerrl.links.sequence import Sequence 11 | 12 | 13 | def to_factorized_noisy(link, *args, **kwargs): 14 | """Add noisiness to components of given link 15 | 16 | Currently this 
function supports L.Linear (with and without bias) 17 | """ 18 | 19 | def func_to_factorized_noisy(link): 20 | if isinstance(link, Linear): 21 | return FactorizedNoisyLinear(link, *args, **kwargs) 22 | else: 23 | return link 24 | 25 | _map_links(func_to_factorized_noisy, link) 26 | 27 | 28 | def _map_links(func, link): 29 | if isinstance(link, chainer.Chain): 30 | children_names = link._children.copy() 31 | for name in children_names: 32 | child = getattr(link, name) 33 | new_child = func(child) 34 | if new_child is child: 35 | _map_links(func, child) 36 | else: 37 | delattr(link, name) 38 | with link.init_scope(): 39 | setattr(link, name, new_child) 40 | elif isinstance(link, chainer.ChainList): 41 | children = link._children 42 | for i in range(len(children)): 43 | child = children[i] 44 | new_child = func(child) 45 | if new_child is child: 46 | _map_links(func, child) 47 | else: 48 | # mimic ChainList.add_link 49 | children[i] = new_child 50 | children[i].name = str(i) 51 | 52 | if isinstance(link, Sequence): 53 | # assumes i-th layer corresponds with i-th child 54 | link.layers[i] = new_child 55 | -------------------------------------------------------------------------------- /tests/explorers_tests/test_boltzmann.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import unittest 9 | 10 | import chainer 11 | import numpy as np 12 | 13 | import chainerrl 14 | 15 | 16 | def count_actions_selected_by_boltzmann(T, q_values): 17 | 18 | def greedy_action_func(): 19 | raise RuntimeError('Must not be called') 20 | 21 | explorer = chainerrl.explorers.Boltzmann(T=T) 22 | action_value = chainerrl.action_value.DiscreteActionValue(q_values) 23 | 24 | action_count = [0] * 3 25 | 26 | for t in range(10000): 27 | a = explorer.select_action(t, greedy_action_func, action_value) 28 | action_count[a] += 1 29 | 30 | return action_count 31 | 32 | 33 | class TestBoltzmann(unittest.TestCase): 34 | 35 | def test_boltzmann(self): 36 | 37 | # T=1 38 | q_values = chainer.Variable(np.asarray([[-1, 1, 0]], dtype=np.float32)) 39 | action_count = count_actions_selected_by_boltzmann(1, q_values) 40 | print('T=1', action_count) 41 | # Actions with larger values must be selected more often 42 | self.assertGreater(action_count[1], action_count[2]) 43 | self.assertGreater(action_count[2], action_count[0]) 44 | 45 | # T=0.5 46 | action_count_t05 = count_actions_selected_by_boltzmann(0.5, q_values) 47 | print('T=0.5', action_count_t05) 48 | # Actions with larger values must be selected more often 49 | self.assertGreater(action_count_t05[1], action_count_t05[2]) 50 | self.assertGreater(action_count_t05[2], action_count_t05[0]) 51 | 52 | # T=0.5 must be more greedy than T=1 53 | self.assertGreater(action_count_t05[1], action_count[1]) 54 | -------------------------------------------------------------------------------- /tests/misc_tests/test_random_seed.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from builtins import * # NOQA 7 | standard_library.install_aliases() # NOQA 8 | 9 | import random 
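A standalone NumPy re-derivation (hedged; not library code) of what the Boltzmann test above asserts: lowering the temperature T makes the softmax over Q-values more greedy.
```
import numpy as np


def boltzmann_probs(q, T):
    z = np.exp((q - q.max()) / T)  # subtract the max for numerical stability
    return z / z.sum()


q = np.array([-1.0, 1.0, 0.0])
print(boltzmann_probs(q, T=1.0))  # ~[0.09, 0.67, 0.24]
print(boltzmann_probs(q, T=0.5))  # ~[0.02, 0.87, 0.12] -- more peaked
```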
10 | import unittest 11 | 12 | import chainer 13 | from chainer.testing import attr 14 | import numpy as np 15 | 16 | import chainerrl 17 | 18 | 19 | class TestSetRandomSeed(unittest.TestCase): 20 | 21 | def test_random(self): 22 | chainerrl.misc.set_random_seed(0) 23 | seed0_0 = random.random() 24 | chainerrl.misc.set_random_seed(1) 25 | seed1_0 = random.random() 26 | chainerrl.misc.set_random_seed(0) 27 | seed0_1 = random.random() 28 | chainerrl.misc.set_random_seed(1) 29 | seed1_1 = random.random() 30 | self.assertEqual(seed0_0, seed0_1) 31 | self.assertEqual(seed1_0, seed1_1) 32 | self.assertNotEqual(seed0_0, seed1_0) 33 | 34 | def _test_xp_random(self, xp, gpus): 35 | chainerrl.misc.set_random_seed(0, gpus=gpus) 36 | seed0_0 = xp.random.rand() 37 | chainerrl.misc.set_random_seed(1, gpus=gpus) 38 | seed1_0 = xp.random.rand() 39 | chainerrl.misc.set_random_seed(0, gpus=gpus) 40 | seed0_1 = xp.random.rand() 41 | chainerrl.misc.set_random_seed(1, gpus=gpus) 42 | seed1_1 = xp.random.rand() 43 | self.assertEqual(seed0_0, seed0_1) 44 | self.assertEqual(seed1_0, seed1_1) 45 | self.assertNotEqual(seed0_0, seed1_0) 46 | 47 | def test_numpy_random(self): 48 | self._test_xp_random(np, gpus=()) 49 | # It should ignore negative device IDs 50 | self._test_xp_random(np, gpus=(-1,)) 51 | 52 | @attr.gpu 53 | def test_cupy_random(self): 54 | self._test_xp_random(chainer.cuda.cupy, gpus=(0,)) 55 | -------------------------------------------------------------------------------- /tests/links_tests/test_mlp_bn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | import chainer 12 | import chainer.functions as F 13 | from chainer import testing 14 | from chainer.testing import attr 15 | import numpy as np 16 | 17 | import chainerrl 18 | 19 | 20 | @testing.parameterize( 21 | *testing.product({ 22 | 'in_size': [1, 5], 23 | 'out_size': [1, 3], 24 | 'hidden_sizes': [(), (1,), (1, 1), (7, 8)], 25 | 'normalize_input': [True, False], 26 | 'normalize_output': [True, False], 27 | 'nonlinearity': ['relu', 'elu'], 28 | 'last_wscale': [1, 1e-3], 29 | }) 30 | ) 31 | class TestMLPBN(unittest.TestCase): 32 | 33 | def _test_call(self, gpu): 34 | nonlinearity = getattr(F, self.nonlinearity) 35 | mlp = chainerrl.links.MLPBN( 36 | in_size=self.in_size, 37 | out_size=self.out_size, 38 | hidden_sizes=self.hidden_sizes, 39 | normalize_input=self.normalize_input, 40 | normalize_output=self.normalize_output, 41 | nonlinearity=nonlinearity, 42 | last_wscale=self.last_wscale, 43 | ) 44 | batch_size = 7 45 | x = np.random.rand(batch_size, self.in_size).astype(np.float32) 46 | if gpu >= 0: 47 | mlp.to_gpu(gpu) 48 | x = chainer.cuda.to_gpu(x) 49 | y = mlp(x) 50 | self.assertEqual(y.shape, (batch_size, self.out_size)) 51 | self.assertEqual(chainer.cuda.get_array_module(y), 52 | chainer.cuda.get_array_module(x)) 53 | 54 | def test_call_cpu(self): 55 | self._test_call(gpu=-1) 56 | 57 | @attr.gpu 58 | def test_call_gpu(self): 59 | self._test_call(gpu=0) 60 | -------------------------------------------------------------------------------- /chainerrl/functions/weighted_sum_arrays.py: -------------------------------------------------------------------------------- 1 | from chainer import cuda 2 | 
from chainer import function 3 | from chainer import utils 4 | from chainer.utils import type_check 5 | 6 | 7 | class WeightedSumArrays(function.Function): 8 | """Element-wise weighted sum of input arrays.""" 9 | 10 | def __init__(self, weights): 11 | self.weights = weights 12 | 13 | def check_type_forward(self, in_types): 14 | type_check.expect( 15 | in_types[0].dtype.kind == 'f', 16 | ) 17 | 18 | def forward_cpu(self, inputs): 19 | y = sum(w * x for w, x in zip(self.weights, inputs)) 20 | return utils.force_array(y), 21 | 22 | def backward(self, inputs, grads): 23 | return [w * grads[0] for w in self.weights] 24 | 25 | def forward_gpu(self, inputs): 26 | n = len(inputs) 27 | ptrs = cuda.cupy.asarray([x.data.ptr for x in inputs], 28 | dtype=cuda.cupy.int64) 29 | ws = cuda.cupy.asarray(self.weights, dtype=cuda.cupy.float32) 30 | y = cuda.elementwise( 31 | 'T x0, int64 xs, raw W ws, int32 n_xs', 32 | 'T y', 33 | 'float** xs_ = (float**) xs;' 34 | 'y = 0;' 35 | 'for (size_t j = 0; j < n_xs; ++j) {' 36 | ' y += xs_[j][i] * ws[j];' 37 | '}', 38 | 'weighted_sum_arrays'.format(n))(inputs[0], 39 | ptrs.data.ptr, 40 | ws, 41 | len(ptrs)) 42 | return y, 43 | 44 | 45 | def weighted_sum_arrays(xs, weights): 46 | """Element-wise weighted sum of input arrays. 47 | 48 | Args: 49 | xs (tuple of ~chainer.Variable or ndarray): Input arrays to be summed. 50 | weights (list of float): Weight coefficients of input arrays. 51 | 52 | Returns: 53 | ~chainer.Variable: Output variable. 54 | """ 55 | return WeightedSumArrays(weights)(*xs) 56 | -------------------------------------------------------------------------------- /chainerrl/q_functions/dueling_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | from chainer import links as L 12 | 13 | from chainerrl import action_value 14 | from chainerrl.links.mlp import MLP 15 | from chainerrl.q_function import StateQFunction 16 | 17 | 18 | class DuelingDQN(chainer.Chain, StateQFunction): 19 | """Dueling Q-Network 20 | 21 | See: http://arxiv.org/abs/1511.06581 22 | """ 23 | 24 | def __init__(self, n_actions, n_input_channels=4, 25 | activation=F.relu, bias=0.1): 26 | self.n_actions = n_actions 27 | self.n_input_channels = n_input_channels 28 | self.activation = activation 29 | 30 | super().__init__() 31 | with self.init_scope(): 32 | self.conv_layers = chainer.ChainList( 33 | L.Convolution2D(n_input_channels, 32, 8, stride=4, 34 | initial_bias=bias), 35 | L.Convolution2D(32, 64, 4, stride=2, initial_bias=bias), 36 | L.Convolution2D(64, 64, 3, stride=1, initial_bias=bias)) 37 | 38 | self.a_stream = MLP(3136, n_actions, [512]) 39 | self.v_stream = MLP(3136, 1, [512]) 40 | 41 | def __call__(self, x): 42 | h = x 43 | for l in self.conv_layers: 44 | h = self.activation(l(h)) 45 | 46 | # Advantage 47 | batch_size = x.shape[0] 48 | ya = self.a_stream(h) 49 | mean = F.reshape( 50 | F.sum(ya, axis=1) / self.n_actions, (batch_size, 1)) 51 | ya, mean = F.broadcast(ya, mean) 52 | ya -= mean 53 | 54 | # State value 55 | ys = self.v_stream(h) 56 | 57 | ya, ys = F.broadcast(ya, ys) 58 | q = ya + ys 59 | return action_value.DiscreteActionValue(q) 60 | 
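The aggregation step of the dueling head above, isolated in NumPy so the broadcast arithmetic is easy to follow (all values invented):
```
import numpy as np

# Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')
advantage = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)  # (batch, n_actions)
state_value = np.array([[10.0]], dtype=np.float32)         # (batch, 1)
q = state_value + advantage - advantage.mean(axis=1, keepdims=True)
print(q)  # [[ 9. 10. 11.]] -- mean-zero advantages shifted by V(s)
```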
-------------------------------------------------------------------------------- /chainerrl/links/sequence.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | 11 | from chainerrl.recurrent import RecurrentChainMixin 12 | 13 | try: 14 | # For Python 3.5 and later 15 | from inspect import Parameter 16 | from inspect import signature 17 | except Exception: 18 | from funcsigs import Parameter 19 | from funcsigs import signature 20 | 21 | 22 | def accept_variable_arguments(func): 23 | for param in signature(func).parameters.values(): 24 | if param.kind in (Parameter.VAR_POSITIONAL, 25 | Parameter.VAR_KEYWORD): 26 | return True 27 | return False 28 | 29 | 30 | class Sequence(chainer.ChainList, RecurrentChainMixin): 31 | """Sequential callable Link that consists of other Links.""" 32 | 33 | def __init__(self, *layers): 34 | self.layers = list(layers) 35 | links = [layer for layer in layers if isinstance(layer, chainer.Link)] 36 | # Cache the signatures because it might be slow 37 | self.argnames = [set(signature(layer).parameters) 38 | for layer in layers] 39 | self.accept_var_args = [accept_variable_arguments(layer) 40 | for layer in layers] 41 | super().__init__(*links) 42 | 43 | def __call__(self, x, **kwargs): 44 | h = x 45 | for layer, argnames, accept_var_args in zip(self.layers, 46 | self.argnames, 47 | self.accept_var_args): 48 | if accept_var_args: 49 | layer_kwargs = kwargs 50 | else: 51 | layer_kwargs = {k: v for k, v in kwargs.items() 52 | if k in argnames} 53 | h = layer(h, **layer_kwargs) 54 | return h 55 | -------------------------------------------------------------------------------- /tests/functions_tests/test_sum_arrays.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import numpy 10 | 11 | import chainer 12 | from chainer import cuda 13 | from chainer import gradient_check 14 | from chainer import testing 15 | from chainer.testing import attr 16 | 17 | import chainerrl 18 | 19 | 20 | @testing.parameterize( 21 | *testing.product({ 22 | 'batchsize': [1, 3], 23 | 'n': [1, 2, 7], 24 | 'shape': [(1,), (1, 1), (2,), (2, 3)], 25 | }) 26 | ) 27 | class TestSumArrays(unittest.TestCase): 28 | 29 | def setUp(self): 30 | self.batch_size = 5 31 | array_shape = (self.batchsize,) + self.shape 32 | self.xs = [numpy.random.uniform( 33 | -1, 1, array_shape).astype(numpy.float32) 34 | for _ in range(self.n)] 35 | self.gy = numpy.random.uniform( 36 | -1, 1, array_shape).astype(numpy.float32) 37 | 38 | def check_forward(self, xs): 39 | y = chainerrl.functions.sum_arrays(xs) 40 | correct_y = sum(self.xs) 41 | gradient_check.assert_allclose(correct_y, cuda.to_cpu(y.array)) 42 | 43 | def test_forward_cpu(self): 44 | self.check_forward(self.xs) 45 | 46 | @attr.gpu 47 | def test_forward_gpu(self): 48 | xs_gpu = [chainer.cuda.to_gpu(x) for x in self.xs] 49 | self.check_forward(xs_gpu) 50 | 51 | def check_backward(self, x_data, y_grad): 52 | 
gradient_check.check_backward( 53 | chainerrl.functions.SumArrays(), 54 | x_data, y_grad, eps=1e-2, rtol=1e-2) 55 | 56 | def test_backward_cpu(self): 57 | self.check_backward(self.xs, self.gy) 58 | 59 | @attr.gpu 60 | def test_backward_gpu(self): 61 | xs_gpu = [chainer.cuda.to_gpu(x) for x in self.xs] 62 | self.check_backward(xs_gpu, cuda.to_gpu(self.gy)) 63 | 64 | 65 | testing.run_module(__name__, __file__) 66 | -------------------------------------------------------------------------------- /chainerrl/agents/residual_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | import chainer.functions as F 9 | 10 | from chainerrl.agents.dqn import DQN 11 | from chainerrl.functions import scale_grad 12 | 13 | 14 | class ResidualDQN(DQN): 15 | """DQN that allows maxQ also backpropagate gradients.""" 16 | 17 | def __init__(self, *args, **kwargs): 18 | self.grad_scale = kwargs.pop('grad_scale', 1.0) 19 | super().__init__(*args, **kwargs) 20 | 21 | def sync_target_network(self): 22 | pass 23 | 24 | def _compute_target_values(self, exp_batch, gamma): 25 | 26 | batch_next_state = exp_batch['next_state'] 27 | 28 | target_next_qout = self.q_function(batch_next_state) 29 | next_q_max = target_next_qout.max 30 | 31 | batch_rewards = exp_batch['reward'] 32 | batch_terminal = exp_batch['is_state_terminal'] 33 | 34 | return batch_rewards + self.gamma * (1.0 - batch_terminal) * next_q_max 35 | 36 | def _compute_y_and_t(self, exp_batch, gamma): 37 | 38 | batch_state = exp_batch['state'] 39 | batch_size = len(batch_state) 40 | 41 | # Compute Q-values for current states 42 | qout = self.q_function(batch_state) 43 | 44 | batch_actions = exp_batch['action'] 45 | batch_q = F.reshape(qout.evaluate_actions( 46 | batch_actions), (batch_size, 1)) 47 | 48 | # Target values must also backprop gradients 49 | batch_q_target = F.reshape( 50 | self._compute_target_values(exp_batch, gamma), (batch_size, 1)) 51 | 52 | return batch_q, scale_grad.scale_grad(batch_q_target, self.grad_scale) 53 | 54 | @property 55 | def saved_attributes(self): 56 | # ResidualDQN doesn't use target models 57 | return ('model', 'optimizer') 58 | 59 | def input_initial_batch_to_target_model(self, batch): 60 | pass 61 | -------------------------------------------------------------------------------- /tests/links_tests/test_sequence.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | from chainerrl.links import Sequence 12 | 13 | 14 | class TestSequence(unittest.TestCase): 15 | 16 | def test_call(self): 17 | 18 | def func_a(x): 19 | return x + 1 20 | 21 | b_test_mode = [False] 22 | 23 | def func_b(x, test=False): 24 | b_test_mode[0] = test 25 | return x + 1 26 | 27 | c_test_mode = [False] 28 | c_hoge_mode = [False] 29 | 30 | def func_c(x, test=False, hoge=False): 31 | c_test_mode[0] = test 32 | c_hoge_mode[0] = hoge 33 | return x + 1 34 | 35 | def _test_call(seq): 36 | 37 | out = seq(1) 38 
| self.assertEqual(out, 4) 39 | self.assertFalse(b_test_mode[0]) 40 | self.assertFalse(c_test_mode[0]) 41 | self.assertFalse(c_hoge_mode[0]) 42 | 43 | out = seq(1, test=True) 44 | self.assertEqual(out, 4) 45 | self.assertTrue(b_test_mode[0]) 46 | self.assertTrue(c_test_mode[0]) 47 | self.assertFalse(c_hoge_mode[0]) 48 | 49 | out = seq(1, test=True, hoge=True) 50 | self.assertEqual(out, 4) 51 | self.assertTrue(b_test_mode[0]) 52 | self.assertTrue(c_test_mode[0]) 53 | self.assertTrue(c_hoge_mode[0]) 54 | 55 | out = seq(1, test=False, hoge=True) 56 | self.assertEqual(out, 4) 57 | self.assertFalse(b_test_mode[0]) 58 | self.assertFalse(c_test_mode[0]) 59 | self.assertTrue(c_hoge_mode[0]) 60 | 61 | _test_call(Sequence(func_a, func_b, func_c)) 62 | _test_call(Sequence(Sequence(func_a, func_b, func_c))) 63 | _test_call(Sequence(Sequence(func_a), 64 | Sequence(func_b), Sequence(func_c))) 65 | -------------------------------------------------------------------------------- /chainerrl/policies/softmax_policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from logging import getLogger 10 | 11 | import chainer 12 | from chainer import functions as F 13 | 14 | from chainerrl import distribution 15 | from chainerrl.links.mlp import MLP 16 | from chainerrl.policy import Policy 17 | 18 | 19 | logger = getLogger(__name__) 20 | 21 | 22 | class SoftmaxPolicy(chainer.Chain, Policy): 23 | """Softmax policy that uses Boltzmann distributions. 24 | 25 | Args: 26 | model (chainer.Link): 27 | Link that is callable and outputs action values. 28 | beta (float): 29 | Parameter of Boltzmann distributions. 
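        min_prob (float):
            Minimum probability assigned to each action by the resulting
            distribution.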
30 | """ 31 | 32 | def __init__(self, model, beta=1.0, min_prob=0.0): 33 | self.beta = beta 34 | self.min_prob = min_prob 35 | super().__init__(model=model) 36 | 37 | def __call__(self, x): 38 | h = self.model(x) 39 | return distribution.SoftmaxDistribution( 40 | h, beta=self.beta, min_prob=self.min_prob) 41 | 42 | 43 | class FCSoftmaxPolicy(SoftmaxPolicy): 44 | """Softmax policy that consists of FC layers and rectifiers""" 45 | 46 | def __init__(self, n_input_channels, n_actions, 47 | n_hidden_layers=0, n_hidden_channels=None, 48 | beta=1.0, nonlinearity=F.relu, 49 | last_wscale=1.0, 50 | min_prob=0.0): 51 | self.n_input_channels = n_input_channels 52 | self.n_actions = n_actions 53 | self.n_hidden_layers = n_hidden_layers 54 | self.n_hidden_channels = n_hidden_channels 55 | self.beta = beta 56 | 57 | super().__init__( 58 | model=MLP(n_input_channels, 59 | n_actions, 60 | (n_hidden_channels,) * n_hidden_layers, 61 | nonlinearity=nonlinearity, 62 | last_wscale=last_wscale), 63 | beta=self.beta, 64 | min_prob=min_prob) 65 | -------------------------------------------------------------------------------- /tests/explorers_tests/test_epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import logging 9 | import unittest 10 | 11 | from chainerrl.explorers import epsilon_greedy 12 | 13 | 14 | class TestEpsilonGreedy(unittest.TestCase): 15 | 16 | def test_linear_decay_epsilon_greedy(self): 17 | 18 | random_action_func_count = [0] 19 | greedy_action_func_count = [0] 20 | 21 | def random_action_func(): 22 | random_action_func_count[0] += 1 23 | return 0 24 | 25 | def greedy_action_func(): 26 | greedy_action_func_count[0] += 1 27 | return 0 28 | 29 | explorer = epsilon_greedy.LinearDecayEpsilonGreedy(1.0, 0.1, 50, 30 | random_action_func) 31 | 32 | explorer.logger.addHandler(logging.StreamHandler()) 33 | explorer.logger.setLevel(logging.DEBUG) 34 | 35 | self.assertAlmostEqual(explorer.epsilon, 1.0) 36 | 37 | for t in range(100): 38 | explorer.select_action(t, greedy_action_func) 39 | 40 | self.assertAlmostEqual(explorer.epsilon, 0.1) 41 | 42 | def test_constant_epsilon_greedy(self): 43 | 44 | random_action_func_count = [0] 45 | greedy_action_func_count = [0] 46 | 47 | def random_action_func(): 48 | random_action_func_count[0] += 1 49 | return 0 50 | 51 | def greedy_action_func(): 52 | greedy_action_func_count[0] += 1 53 | return 0 54 | 55 | explorer = epsilon_greedy.ConstantEpsilonGreedy(0.1, 56 | random_action_func) 57 | 58 | explorer.logger.addHandler(logging.StreamHandler()) 59 | explorer.logger.setLevel(logging.DEBUG) 60 | 61 | self.assertAlmostEqual(explorer.epsilon, 0.1) 62 | 63 | for t in range(100): 64 | explorer.select_action(t, greedy_action_func) 65 | 66 | self.assertAlmostEqual(explorer.epsilon, 0.1) 67 | -------------------------------------------------------------------------------- /tests/wrappers_tests/test_cast_observation.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # 
NOQA 8 | 9 | 10 | import unittest 11 | 12 | from chainer import testing 13 | import gym 14 | import numpy as np 15 | 16 | import chainerrl 17 | 18 | 19 | @testing.parameterize(*testing.product({ 20 | 'env_id': ['CartPole-v1', 'Pendulum-v0'], 21 | 'dtype': [np.float16, np.float32, np.float64] 22 | })) 23 | class TestCastObservation(unittest.TestCase): 24 | 25 | def test_cast_observation(self): 26 | env = chainerrl.wrappers.CastObservation( 27 | gym.make(self.env_id), dtype=self.dtype) 28 | rtol = 1e-3 if self.dtype == np.float16 else 1e-7 29 | 30 | obs = env.reset() 31 | self.assertEqual(env.original_observation.dtype, np.float64) 32 | self.assertEqual(obs.dtype, self.dtype) 33 | np.testing.assert_allclose(env.original_observation, obs, rtol=rtol) 34 | 35 | obs, r, done, info = env.step(env.action_space.sample()) 36 | 37 | self.assertEqual(env.original_observation.dtype, np.float64) 38 | self.assertEqual(obs.dtype, self.dtype) 39 | np.testing.assert_allclose(env.original_observation, obs, rtol=rtol) 40 | 41 | 42 | @testing.parameterize(*testing.product({ 43 | 'env_id': ['CartPole-v1', 'Pendulum-v0'], 44 | })) 45 | class TestCastObservationToFloat32(unittest.TestCase): 46 | 47 | def test_cast_observation(self): 48 | env = chainerrl.wrappers.CastObservationToFloat32( 49 | gym.make(self.env_id)) 50 | 51 | obs = env.reset() 52 | self.assertEqual(env.original_observation.dtype, np.float64) 53 | self.assertEqual(obs.dtype, np.float32) 54 | np.testing.assert_allclose(env.original_observation, obs) 55 | 56 | obs, r, done, info = env.step(env.action_space.sample()) 57 | self.assertEqual(env.original_observation.dtype, np.float64) 58 | self.assertEqual(obs.dtype, np.float32) 59 | np.testing.assert_allclose(env.original_observation, obs) 60 | -------------------------------------------------------------------------------- /chainerrl/agents/double_pal.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | 12 | from chainerrl.agents import pal 13 | from chainerrl.recurrent import state_kept 14 | 15 | 16 | class DoublePAL(pal.PAL): 17 | 18 | def _compute_y_and_t(self, exp_batch): 19 | 20 | batch_state = exp_batch['state'] 21 | batch_size = len(exp_batch['reward']) 22 | 23 | qout = self.q_function(batch_state) 24 | 25 | batch_actions = exp_batch['action'] 26 | batch_q = qout.evaluate_actions(batch_actions) 27 | 28 | # Compute target values 29 | 30 | with chainer.no_backprop_mode(): 31 | target_qout = self.target_q_function(batch_state) 32 | 33 | batch_next_state = exp_batch['next_state'] 34 | 35 | with state_kept(self.q_function): 36 | next_qout = self.q_function(batch_next_state) 37 | 38 | with state_kept(self.target_q_function): 39 | target_next_qout = self.target_q_function( 40 | batch_next_state) 41 | next_q_max = F.reshape(target_next_qout.evaluate_actions( 42 | next_qout.greedy_actions), (batch_size,)) 43 | 44 | batch_rewards = exp_batch['reward'] 45 | batch_terminal = exp_batch['is_state_terminal'] 46 | 47 | # T Q: Bellman operator 48 | t_q = batch_rewards + exp_batch['discount'] * \ 49 | (1.0 - batch_terminal) * next_q_max 50 | 51 | # T_PAL Q: persistent advantage learning operator 52 | cur_advantage = F.reshape( 53 | 
target_qout.compute_advantage(batch_actions), (batch_size,)) 54 | next_advantage = F.reshape( 55 | target_next_qout.compute_advantage(batch_actions), 56 | (batch_size,)) 57 | tpal_q = t_q + self.alpha * \ 58 | F.maximum(cur_advantage, next_advantage) 59 | 60 | return batch_q, tpal_q 61 | -------------------------------------------------------------------------------- /chainerrl/functions/invert_gradients.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | from __future__ import absolute_import 4 | from future import standard_library 5 | standard_library.install_aliases() # NOQA 6 | 7 | from chainer import cuda 8 | from chainer import function 9 | from chainer.utils import type_check 10 | 11 | 12 | class InvertGradients(function.Function): 13 | """Inverts gradients of values exceeding a given range. 14 | 15 | See: http://arxiv.org/abs/1511.04143 16 | """ 17 | 18 | def __init__(self, range_min, range_max): 19 | self.range_min = range_min 20 | self.range_max = range_max 21 | self.range_width = self.range_max - self.range_min 22 | assert (self.range_width > 0).all() 23 | 24 | def check_type_forward(self, in_types): 25 | type_check.expect(in_types.size() == 1,) 26 | 27 | @property 28 | def label(self): 29 | return 'InvertGradients' 30 | 31 | def forward(self, inputs): 32 | return inputs 33 | 34 | def backward(self, inputs, grad_outputs): 35 | x, = inputs 36 | gy, = grad_outputs 37 | # In chainer, update will be like x.array -= lr * x.grad, 38 | # which means negative gradients will increase values. 39 | increasing = (gy < 0).astype(gy.dtype) 40 | gx = gy.copy() 41 | gx *= ((self.range_max - x) / self.range_width * increasing + 42 | (x - self.range_min) / self.range_width * (1 - increasing)) 43 | return gx, 44 | 45 | 46 | def invert_gradients(x, range_min, range_max): 47 | """Inverts gradients of values exceeding a given range. 48 | 49 | See: http://arxiv.org/abs/1511.04143 50 | 51 | Args: 52 | x (chainer.Variable or ndarray): Input value. 53 | range_min (chainer.Variable or ndarray): Minimum of the value range. 54 | range_max (chainer.Variable or ndarray): Maximum of the value range. 55 | Returns: 56 | The same value as x, except that the backpropagated gradients are 57 | scaled and inverted so that the values stay within the given range after an update.
58 | """ 59 | xp = cuda.get_array_module(x, x.array) 60 | return InvertGradients(xp.asarray(range_min), xp.asarray(range_max))(x) 61 | -------------------------------------------------------------------------------- /tests/functions_tests/test_weighted_sum_arrays.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import numpy 10 | 11 | import chainer 12 | from chainer import cuda 13 | from chainer import gradient_check 14 | from chainer import testing 15 | from chainer.testing import attr 16 | 17 | import chainerrl 18 | 19 | 20 | @testing.parameterize( 21 | *testing.product({ 22 | 'batchsize': [1, 3], 23 | 'n': [1, 2, 7], 24 | 'shape': [(1,), (1, 1), (2,), (2, 3)], 25 | }) 26 | ) 27 | class TestSumArrays(unittest.TestCase): 28 | 29 | def setUp(self): 30 | self.batch_size = 5 31 | array_shape = (self.batchsize,) + self.shape 32 | self.xs = [numpy.random.uniform( 33 | -1, 1, array_shape).astype(numpy.float32) 34 | for _ in range(self.n)] 35 | self.weights = [numpy.random.rand() for _ in range(self.n)] 36 | self.gy = numpy.random.uniform( 37 | -1, 1, array_shape).astype(numpy.float32) 38 | 39 | def check_forward(self, xs): 40 | y = chainerrl.functions.weighted_sum_arrays(xs, weights=self.weights) 41 | correct_y = sum(x * w for x, w in zip(self.xs, self.weights)) 42 | gradient_check.assert_allclose(correct_y, cuda.to_cpu(y.array)) 43 | 44 | def test_forward_cpu(self): 45 | self.check_forward(self.xs) 46 | 47 | @attr.gpu 48 | def test_forward_gpu(self): 49 | xs_gpu = [chainer.cuda.to_gpu(x) for x in self.xs] 50 | self.check_forward(xs_gpu) 51 | 52 | def check_backward(self, x_data, y_grad): 53 | gradient_check.check_backward( 54 | chainerrl.functions.WeightedSumArrays(self.weights), 55 | x_data, y_grad, eps=1e-2, rtol=1e-2) 56 | 57 | def test_backward_cpu(self): 58 | self.check_backward(self.xs, self.gy) 59 | 60 | @attr.gpu 61 | def test_backward_gpu(self): 62 | xs_gpu = [chainer.cuda.to_gpu(x) for x in self.xs] 63 | self.check_backward(xs_gpu, cuda.to_gpu(self.gy)) 64 | 65 | 66 | testing.run_module(__name__, __file__) 67 | -------------------------------------------------------------------------------- /chainerrl/links/dqn_head.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | import chainer 9 | from chainer import functions as F 10 | from chainer import links as L 11 | 12 | 13 | class NatureDQNHead(chainer.ChainList): 14 | """DQN's head (Nature version)""" 15 | 16 | def __init__(self, n_input_channels=4, n_output_channels=512, 17 | activation=F.relu, bias=0.1): 18 | self.n_input_channels = n_input_channels 19 | self.activation = activation 20 | self.n_output_channels = n_output_channels 21 | 22 | layers = [ 23 | L.Convolution2D(n_input_channels, 32, 8, stride=4, 24 | initial_bias=bias), 25 | L.Convolution2D(32, 64, 4, stride=2, initial_bias=bias), 26 | L.Convolution2D(64, 64, 3, stride=1, initial_bias=bias), 27 | L.Linear(3136, n_output_channels, initial_bias=bias), 
28 | ] 29 | 30 | super(NatureDQNHead, self).__init__(*layers) 31 | 32 | def __call__(self, state): 33 | h = state 34 | for layer in self: 35 | h = self.activation(layer(h)) 36 | return h 37 | 38 | 39 | class NIPSDQNHead(chainer.ChainList): 40 | """DQN's head (NIPS workshop version)""" 41 | 42 | def __init__(self, n_input_channels=4, n_output_channels=256, 43 | activation=F.relu, bias=0.1): 44 | self.n_input_channels = n_input_channels 45 | self.activation = activation 46 | self.n_output_channels = n_output_channels 47 | 48 | layers = [ 49 | L.Convolution2D(n_input_channels, 16, 8, stride=4, 50 | initial_bias=bias), 51 | L.Convolution2D(16, 32, 4, stride=2, initial_bias=bias), 52 | L.Linear(2592, n_output_channels, initial_bias=bias), 53 | ] 54 | 55 | super(NIPSDQNHead, self).__init__(*layers) 56 | 57 | def __call__(self, state): 58 | h = state 59 | for layer in self: 60 | h = self.activation(layer(h)) 61 | return h 62 | -------------------------------------------------------------------------------- /chainerrl/experiments/hooks.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | from future.utils import with_metaclass 13 | import numpy as np 14 | 15 | 16 | class StepHook(with_metaclass(ABCMeta, object)): 17 | """Hook function that will be called in training. 18 | 19 | This class is for clarifying the interface required for Hook functions. 20 | You don't need to inherit this class to define your own hooks. Any callable 21 | that accepts (env, agent, step) as arguments can be used as a hook. 22 | """ 23 | 24 | @abstractmethod 25 | def __call__(self, env, agent, step): 26 | """Call the hook. 27 | 28 | Args: 29 | env: Environment. 30 | agent: Agent. 31 | step: Current timestep. 32 | """ 33 | raise NotImplementedError 34 | 35 | 36 | class LinearInterpolationHook(StepHook): 37 | """Hook that will set a linearly interpolated value. 38 | 39 | You can use this hook to decay the learning rate by using a setter function 40 | as follows: 41 | 42 | .. code-block:: python 43 | 44 | def lr_setter(env, agent, value): 45 | agent.optimizer.lr = value 46 | 47 | hook = LinearInterpolationHook(10 ** 6, 1e-3, 0, lr_setter) 48 | 49 | 50 | Args: 51 | total_steps (int): Number of total steps. 52 | start_value (float): Start value. 53 | stop_value (float): Stop value. 
54 | setter (callable): (env, agent, value) -> None 55 | """ 56 | 57 | def __init__(self, total_steps, start_value, stop_value, setter): 58 | self.total_steps = total_steps 59 | self.start_value = start_value 60 | self.stop_value = stop_value 61 | self.setter = setter 62 | 63 | def __call__(self, env, agent, step): 64 | value = np.interp(step, 65 | [1, self.total_steps], 66 | [self.start_value, self.stop_value]) 67 | self.setter(env, agent, value) 68 | -------------------------------------------------------------------------------- /chainerrl/agents/al.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | 12 | from chainerrl.agents import dqn 13 | from chainerrl.recurrent import state_kept 14 | 15 | 16 | class AL(dqn.DQN): 17 | """Advantage Learning. 18 | 19 | See: http://arxiv.org/abs/1512.04860. 20 | 21 | Args: 22 | alpha (float): Weight of (persistent) advantages. Convergence 23 | is guaranteed only for alpha in [0, 1). 24 | 25 | For other arguments, see DQN. 26 | """ 27 | 28 | def __init__(self, *args, **kwargs): 29 | self.alpha = kwargs.pop('alpha', 0.9) 30 | super().__init__(*args, **kwargs) 31 | 32 | def _compute_y_and_t(self, exp_batch): 33 | 34 | batch_state = exp_batch['state'] 35 | batch_size = len(exp_batch['reward']) 36 | 37 | qout = self.q_function(batch_state) 38 | 39 | batch_actions = exp_batch['action'] 40 | 41 | batch_q = qout.evaluate_actions(batch_actions) 42 | 43 | # Compute target values 44 | 45 | with chainer.no_backprop_mode(): 46 | target_qout = self.target_q_function(batch_state) 47 | 48 | batch_next_state = exp_batch['next_state'] 49 | 50 | with state_kept(self.target_q_function): 51 | target_next_qout = self.target_q_function( 52 | batch_next_state) 53 | next_q_max = F.reshape(target_next_qout.max, (batch_size,)) 54 | 55 | batch_rewards = exp_batch['reward'] 56 | batch_terminal = exp_batch['is_state_terminal'] 57 | 58 | # T Q: Bellman operator 59 | t_q = batch_rewards + exp_batch['discount'] * \ 60 | (1.0 - batch_terminal) * next_q_max 61 | 62 | # T_AL Q: advantage learning operator 63 | cur_advantage = F.reshape( 64 | target_qout.compute_advantage(batch_actions), (batch_size,)) 65 | tal_q = t_q + self.alpha * cur_advantage 66 | 67 | return batch_q, tal_q 68 | 69 | def input_initial_batch_to_target_model(self, batch): 70 | pass 71 | -------------------------------------------------------------------------------- /tests/misc_tests/test_copy_param.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import chainer 10 | from chainer import links as L 11 | import numpy as np 12 | 13 | from chainerrl.misc import copy_param 14 | 15 | 16 | class TestCopyParam(unittest.TestCase): 17 | 18 | def test_copy_param(self): 19 | a = L.Linear(1, 5) 20 | b = L.Linear(1, 5) 21 | 22 | s = chainer.Variable(np.random.rand(1, 1).astype(np.float32)) 23 | a_out = list(a(s).array.ravel()) 24 | b_out = 
list(b(s).array.ravel()) 25 | self.assertNotEqual(a_out, b_out) 26 | 27 | # Copy b's parameters to a 28 | copy_param.copy_param(a, b) 29 | 30 | a_out_new = list(a(s).array.ravel()) 31 | b_out_new = list(b(s).array.ravel()) 32 | self.assertEqual(a_out_new, b_out) 33 | self.assertEqual(b_out_new, b_out) 34 | 35 | def test_copy_param_type_check(self): 36 | a = L.Linear(None, 5) 37 | b = L.Linear(1, 5) 38 | 39 | with self.assertRaises(TypeError): 40 | # Copy b's parameters to a, but since `a` parameter is not 41 | # initialized, it should raise error. 42 | copy_param.copy_param(a, b) 43 | 44 | def test_soft_copy_param(self): 45 | a = L.Linear(1, 5) 46 | b = L.Linear(1, 5) 47 | 48 | a.W.array[:] = 0.5 49 | b.W.array[:] = 1 50 | 51 | # a = (1 - tau) * a + tau * b 52 | copy_param.soft_copy_param(target_link=a, source_link=b, tau=0.1) 53 | 54 | np.testing.assert_almost_equal(a.W.array, np.full(a.W.shape, 0.55)) 55 | np.testing.assert_almost_equal(b.W.array, np.full(b.W.shape, 1.0)) 56 | 57 | copy_param.soft_copy_param(target_link=a, source_link=b, tau=0.1) 58 | 59 | np.testing.assert_almost_equal(a.W.array, np.full(a.W.shape, 0.595)) 60 | np.testing.assert_almost_equal(b.W.array, np.full(b.W.shape, 1.0)) 61 | 62 | def test_soft_copy_param_type_check(self): 63 | a = L.Linear(None, 5) 64 | b = L.Linear(1, 5) 65 | 66 | with self.assertRaises(TypeError): 67 | copy_param.soft_copy_param(target_link=a, source_link=b, tau=0.1) 68 | -------------------------------------------------------------------------------- /chainerrl/misc/draw_computational_graph.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import subprocess 10 | 11 | import chainer.computational_graph 12 | import chainerrl 13 | 14 | 15 | def collect_variables(obj): 16 | """Collect Variable objects inside a given object. 17 | 18 | Args: 19 | obj (object): Object to collect Variable objects from. 20 | Returns: 21 | List of Variable objects. 22 | """ 23 | variables = [] 24 | if isinstance(obj, chainer.Variable): 25 | return [obj] 26 | elif isinstance(obj, chainerrl.action_value.ActionValue): 27 | return list(obj.params) 28 | elif isinstance(obj, chainerrl.distribution.Distribution): 29 | return list(obj.params) 30 | elif isinstance(obj, (list, tuple)): 31 | variables = [] 32 | for child in obj: 33 | variables.extend(collect_variables(child)) 34 | return variables 35 | 36 | 37 | def is_graphviz_available(): 38 | return chainerrl.misc.is_return_code_zero(['dot', '-V']) 39 | 40 | 41 | def draw_computational_graph(outputs, filepath): 42 | """Draw a computational graph and write to a given file. 43 | 44 | Args: 45 | outputs (object): Output(s) of the computational graph. It must be 46 | a Variable, an ActionValue, a Distribution or a list of them. 47 | filepath (str): Filepath to write a graph without file extention. 48 | A DOT file will be saved with ".gv" extension added. 49 | If Graphviz's dot command is available, a PNG file will also be 50 | saved with ".png" extension added. 
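        Example (a minimal sketch; ``model`` and ``obs`` are placeholders
        for any link and input batch you already have):
            draw_computational_graph(model(obs), '/tmp/model_graph')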
51 | """ 52 | variables = collect_variables(outputs) 53 | g = chainer.computational_graph.build_computational_graph(variables) 54 | gv_filepath = filepath + '.gv' 55 | with open(gv_filepath, 'w') as f: 56 | # future.builtins.str is required to make sure the content is unicode 57 | # in both py2 and py3 58 | f.write(str(g.dump())) 59 | if is_graphviz_available(): 60 | png_filepath = filepath + '.png' 61 | subprocess.check_call( 62 | ['dot', '-Tpng', gv_filepath, '-o', png_filepath]) 63 | -------------------------------------------------------------------------------- /chainerrl/explorers/additive_ou.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from logging import getLogger 10 | 11 | import numpy as np 12 | 13 | from chainerrl import explorer 14 | 15 | 16 | class AdditiveOU(explorer.Explorer): 17 | """Additive Ornstein-Uhlenbeck process. 18 | 19 | Used in https://arxiv.org/abs/1509.02971 for exploration. 20 | 21 | Args: 22 | mu (float): Mean of the OU process 23 | theta (float): Friction to pull towards the mean 24 | sigma (float or ndarray): Scale of noise 25 | start_with_mu (bool): Start the process without noise 26 | """ 27 | 28 | def __init__(self, mu=0.0, theta=0.15, sigma=0.3, start_with_mu=False, 29 | logger=getLogger(__name__)): 30 | self.mu = mu 31 | self.theta = theta 32 | self.sigma = sigma 33 | self.start_with_mu = start_with_mu 34 | self.logger = logger 35 | self.ou_state = None 36 | 37 | def evolve(self): 38 | # dx = theta (mu - x) + sigma dW 39 | # for a Wiener process W 40 | noise = np.random.normal(size=self.ou_state.shape, loc=0, 41 | scale=self.sigma) 42 | self.ou_state += self.theta * (self.mu - self.ou_state) + noise 43 | 44 | def select_action(self, t, greedy_action_func, action_value=None): 45 | a = greedy_action_func() 46 | if self.ou_state is None: 47 | if self.start_with_mu: 48 | self.ou_state = np.full(a.shape, self.mu, dtype=np.float32) 49 | else: 50 | sigma_stable = (self.sigma / 51 | np.sqrt(2 * self.theta - self.theta ** 2)) 52 | self.ou_state = np.random.normal( 53 | size=a.shape, 54 | loc=self.mu, scale=sigma_stable).astype(np.float32) 55 | else: 56 | self.evolve() 57 | noise = self.ou_state 58 | self.logger.debug('t:%s noise:%s', t, noise) 59 | return a + noise 60 | 61 | def __repr__(self): 62 | return 'AdditiveOU(mu={}, theta={}, sigma={})'.format( 63 | self.mu, self.theta, self.sigma) 64 | -------------------------------------------------------------------------------- /chainerrl/misc/env_modifiers.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | from builtins import * # NOQA 7 | standard_library.install_aliases() # NOQA 8 | 9 | import numpy as np 10 | 11 | 12 | def make_rendered(env, *render_args, **render_kwargs): 13 | base_step = env.step 14 | base_close = env.close 15 | 16 | def step(action): 17 | ret = base_step(action) 18 | env.render(*render_args, **render_kwargs) 19 | return ret 20 | 21 | def close(): 22 | env.render(*render_args, close=True, **render_kwargs) 23 | base_close() 24 | 
25 | env.step = step 26 | env.close = close 27 | 28 | 29 | def make_timestep_limited(env, timestep_limit): 30 | t = [1] 31 | old_step = env.step 32 | old_reset = env.reset 33 | 34 | def step(action): 35 | observation, reward, done, info = old_step(action) 36 | if t[0] >= timestep_limit: 37 | done = True 38 | t[0] += 1 39 | return observation, reward, done, info 40 | 41 | def reset(): 42 | t[0] = 1 43 | return old_reset() 44 | 45 | env.step = step 46 | env.reset = reset 47 | 48 | 49 | def make_action_filtered(env, action_filter): 50 | old_step = env.step 51 | 52 | def step(action): 53 | return old_step(action_filter(action)) 54 | 55 | env.step = step 56 | 57 | 58 | def make_reward_filtered(env, reward_filter): 59 | old_step = env.step 60 | 61 | def step(action): 62 | observation, reward, done, info = old_step(action) 63 | reward = reward_filter(reward) 64 | return observation, reward, done, info 65 | 66 | env.step = step 67 | 68 | 69 | def make_reward_clipped(env, low, high): 70 | make_reward_filtered(env, lambda x: np.clip(x, low, high)) 71 | 72 | 73 | def make_action_repeated(env, n_times): 74 | """Repeat received actions. 75 | 76 | - Rewards are accumulated while repeating. 77 | - Only latest observations are returned. 78 | """ 79 | old_step = env.step 80 | 81 | def step(action): 82 | r_total = 0 83 | for _ in range(n_times): 84 | obs, r, done, info = old_step(action) 85 | r_total += r 86 | if done: 87 | break 88 | return obs, r_total, done, info 89 | 90 | env.step = step 91 | -------------------------------------------------------------------------------- /tests/wrappers_tests/test_render.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | from chainer import testing 12 | import mock 13 | 14 | import chainerrl 15 | 16 | 17 | @testing.parameterize(*testing.product({ 18 | 'render_kwargs': [ 19 | {}, 20 | {'mode': 'human'}, 21 | {'mode': 'rgb_array'}, 22 | ] 23 | })) 24 | class TestRender(unittest.TestCase): 25 | 26 | def test(self): 27 | orig_env = mock.Mock() 28 | # Reaches the terminal state after five actions 29 | orig_env.reset.side_effect = [ 30 | ('state', 0), 31 | ('state', 3), 32 | ] 33 | orig_env.step.side_effect = [ 34 | (('state', 1), 0, False, {}), 35 | (('state', 2), 1, True, {}), 36 | ] 37 | env = chainerrl.wrappers.Render(orig_env, **self.render_kwargs) 38 | 39 | # Not called env.render yet 40 | self.assertEqual(orig_env.render.call_count, 0) 41 | 42 | obs = env.reset() 43 | self.assertEqual(obs, ('state', 0)) 44 | 45 | # Called once 46 | self.assertEqual(orig_env.render.call_count, 1) 47 | 48 | obs, reward, done, info = env.step(0) 49 | self.assertEqual(obs, ('state', 1)) 50 | self.assertEqual(reward, 0) 51 | self.assertEqual(done, False) 52 | self.assertEqual(info, {}) 53 | 54 | # Called twice 55 | self.assertEqual(orig_env.render.call_count, 2) 56 | 57 | obs, reward, done, info = env.step(0) 58 | self.assertEqual(obs, ('state', 2)) 59 | self.assertEqual(reward, 1) 60 | self.assertEqual(done, True) 61 | self.assertEqual(info, {}) 62 | 63 | # Called thrice 64 | self.assertEqual(orig_env.render.call_count, 3) 65 | 66 | obs = env.reset() 67 | self.assertEqual(obs, ('state', 3)) 68 | 69 | # Called four times 70 | 
self.assertEqual(orig_env.render.call_count, 4) 71 | 72 | # All the calls should receive correct kwargs 73 | for call in orig_env.render.call_args_list: 74 | args, kwargs = call 75 | self.assertEqual(len(args), 0) 76 | self.assertEqual(kwargs, self.render_kwargs) 77 | -------------------------------------------------------------------------------- /tests/agents_tests/test_dpp.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainer import testing 10 | 11 | import basetest_dqn_like as base 12 | from basetest_training import _TestBatchTrainingMixin 13 | from chainerrl.agents.dpp import DPP 14 | from chainerrl.agents.dpp import DPPGreedy 15 | from chainerrl.agents.dpp import DPPL 16 | 17 | 18 | def parse_dpp_agent(dpp_type): 19 | return {'DPP': DPP, 20 | 'DPPL': DPPL, 21 | 'DPPGreedy': DPPGreedy}[dpp_type] 22 | 23 | 24 | @testing.parameterize( 25 | *testing.product({ 26 | 'dpp_type': ['DPP', 'DPPL', 'DPPGreedy'], 27 | }) 28 | ) 29 | class TestDPPOnDiscreteABC( 30 | _TestBatchTrainingMixin, 31 | base._TestDQNOnDiscreteABC): 32 | 33 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 34 | agent_class = parse_dpp_agent(self.dpp_type) 35 | return agent_class( 36 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 37 | replay_start_size=100, target_update_interval=100) 38 | 39 | 40 | # DPP and DPPL don't support continuous action spaces 41 | @testing.parameterize( 42 | *testing.product({ 43 | 'dpp_type': ['DPPGreedy'], 44 | }) 45 | ) 46 | class TestDPPOnContinuousABC( 47 | _TestBatchTrainingMixin, 48 | base._TestDQNOnContinuousABC): 49 | 50 | def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 51 | agent_class = parse_dpp_agent(self.dpp_type) 52 | return agent_class( 53 | q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 54 | replay_start_size=100, target_update_interval=100) 55 | 56 | 57 | # Currently DPP doesn't work with recurrent models 58 | # TODO(fujita) make it work 59 | 60 | # @testing.parameterize( 61 | # *testing.product({ 62 | # 'dpp_type': ['DPP', 'DPPL', 'DPPGreedy'], 63 | # }), 64 | # ) 65 | # class TestDPPOnDiscretePOABC(base._TestDQNOnDiscretePOABC): 66 | # 67 | # def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): 68 | # agent_class = parse_dpp_agent(self.dpp_type) 69 | # return agent_class( 70 | # q_func, opt, rbuf, gpu=gpu, gamma=0.9, explorer=explorer, 71 | # replay_start_size=100, target_update_interval=100, 72 | # episodic_update=True) 73 | -------------------------------------------------------------------------------- /chainerrl/agents/pal.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | 12 | from chainerrl.agents import dqn 13 | from chainerrl.recurrent import state_kept 14 | 15 | 16 | class PAL(dqn.DQN): 17 | """Persistent Advantage Learning. 18 | 19 | See: http://arxiv.org/abs/1512.04860. 
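    The target used here is T Q(s, a) + alpha * max(A(s, a), A(s', a)),
    where T is the Bellman operator and A(s, a) = Q(s, a) - max_b Q(s, b)
    is the advantage under the target network.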
20 | 21 | Args: 22 | alpha (float): Weight of (persistent) advantages. Convergence 23 | is guaranteed only for alpha in [0, 1). 24 | 25 | For other arguments, see DQN. 26 | """ 27 | 28 | def __init__(self, *args, **kwargs): 29 | self.alpha = kwargs.pop('alpha', 0.9) 30 | super().__init__(*args, **kwargs) 31 | 32 | def _compute_y_and_t(self, exp_batch): 33 | 34 | batch_state = exp_batch['state'] 35 | batch_size = len(exp_batch['reward']) 36 | 37 | qout = self.q_function(batch_state) 38 | 39 | batch_actions = exp_batch['action'] 40 | batch_q = qout.evaluate_actions(batch_actions) 41 | 42 | # Compute target values 43 | with chainer.no_backprop_mode(): 44 | 45 | target_qout = self.target_q_function(batch_state) 46 | 47 | batch_next_state = exp_batch['next_state'] 48 | 49 | with state_kept(self.target_q_function): 50 | target_next_qout = self.target_q_function( 51 | batch_next_state) 52 | next_q_max = F.reshape(target_next_qout.max, (batch_size,)) 53 | 54 | batch_rewards = exp_batch['reward'] 55 | batch_terminal = exp_batch['is_state_terminal'] 56 | 57 | # T Q: Bellman operator 58 | t_q = batch_rewards + exp_batch['discount'] * \ 59 | (1.0 - batch_terminal) * next_q_max 60 | 61 | # T_PAL Q: persistent advantage learning operator 62 | cur_advantage = F.reshape( 63 | target_qout.compute_advantage(batch_actions), (batch_size,)) 64 | next_advantage = F.reshape( 65 | target_next_qout.compute_advantage(batch_actions), 66 | (batch_size,)) 67 | tpal_q = t_q + self.alpha * \ 68 | F.maximum(cur_advantage, next_advantage) 69 | 70 | return batch_q, tpal_q 71 | 72 | def input_initial_batch_to_target_model(self, batch): 73 | pass 74 | -------------------------------------------------------------------------------- /chainerrl/__init__.py: -------------------------------------------------------------------------------- 1 | from chainerrl import action_value # NOQA 2 | from chainerrl import agent # NOQA 3 | from chainerrl import agents # NOQA 4 | from chainerrl import distribution # NOQA 5 | from chainerrl import env # NOQA 6 | from chainerrl import envs # NOQA 7 | from chainerrl import experiments # NOQA 8 | from chainerrl import explorer # NOQA 9 | from chainerrl import explorers # NOQA 10 | from chainerrl import functions # NOQA 11 | from chainerrl import links # NOQA 12 | from chainerrl import misc # NOQA 13 | from chainerrl import optimizers # NOQA 14 | from chainerrl import policies # NOQA 15 | from chainerrl import policy # NOQA 16 | from chainerrl import q_function # NOQA 17 | from chainerrl import q_functions # NOQA 18 | from chainerrl import recurrent # NOQA 19 | from chainerrl import replay_buffer # NOQA 20 | from chainerrl import v_function # NOQA 21 | from chainerrl import v_functions # NOQA 22 | from chainerrl import wrappers # NOQA 23 | 24 | # For backward compatibility while avoiding circular import 25 | policy.SoftmaxPolicy = policies.SoftmaxPolicy 26 | policy.FCSoftmaxPolicy = policies.FCSoftmaxPolicy 27 | policy.ContinuousDeterministicPolicy = policies.ContinuousDeterministicPolicy 28 | policy.FCDeterministicPolicy = policies.FCDeterministicPolicy 29 | policy.FCBNDeterministicPolicy = policies.FCBNDeterministicPolicy 30 | policy.FCLSTMDeterministicPolicy = policies.FCLSTMDeterministicPolicy 31 | policy.FCGaussianPolicy = policies.FCGaussianPolicy 32 | policy.MellowmaxPolicy = policies.MellowmaxPolicy 33 | 34 | q_function.DuelingDQN = q_functions.DuelingDQN 35 | q_function.SingleModelStateActionQFunction = \ 36 | q_functions.SingleModelStateActionQFunction 37 | q_function.FCSAQFunction = 
q_functions.FCSAQFunction 38 | q_function.FCLSTMSAQFunction = q_functions.FCLSTMSAQFunction 39 | q_function.FCBNSAQFunction = q_functions.FCBNSAQFunction 40 | q_function.FCBNLateActionSAQFunction = q_functions.FCBNLateActionSAQFunction 41 | q_function.FCLateActionSAQFunction = q_functions.FCLateActionSAQFunction 42 | q_function.SingleModelStateActionQFunction = \ 43 | q_functions.SingleModelStateActionQFunction 44 | q_function.FCStateQFunctionWithDiscreteAction = \ 45 | q_functions.FCStateQFunctionWithDiscreteAction 46 | q_function.FCLSTMStateQFunction = q_functions.FCLSTMStateQFunction 47 | q_function.FCQuadraticStateQFunction = q_functions.FCQuadraticStateQFunction 48 | q_function.FCBNQuadraticStateQFunction = \ 49 | q_functions.FCBNQuadraticStateQFunction 50 | 51 | v_function.SingleModelVFunction = v_functions.SingleModelVFunction 52 | v_function.FCVFunction = v_functions.FCVFunction 53 | -------------------------------------------------------------------------------- /chainerrl/links/noisy_linear.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | from chainer.initializers import LeCunUniform 4 | import chainer.links as L 5 | import numpy 6 | 7 | from chainerrl.initializers import VarianceScalingConstant 8 | 9 | 10 | class FactorizedNoisyLinear(chainer.Chain): 11 | """Linear layer in Factorized Noisy Network 12 | 13 | Args: 14 | mu_link (L.Linear): Linear link that computes mean of output. 15 | sigma_scale (float): The hyperparameter sigma_0 in the original paper. 16 | Scaling factor of the initial weights of noise-scaling parameters. 17 | """ 18 | 19 | def __init__(self, mu_link, sigma_scale=0.4): 20 | super(FactorizedNoisyLinear, self).__init__() 21 | self.out_size = mu_link.out_size 22 | self.nobias = not ('/b' in [name for name, _ in mu_link.namedparams()]) 23 | 24 | W_data = mu_link.W.array 25 | in_size = None if W_data is None else W_data.shape[1] 26 | device_id = mu_link._device_id 27 | 28 | with self.init_scope(): 29 | self.mu = L.Linear(in_size, self.out_size, self.nobias, 30 | initialW=LeCunUniform(1 / numpy.sqrt(3))) 31 | 32 | self.sigma = L.Linear(in_size, self.out_size, self.nobias, 33 | initialW=VarianceScalingConstant( 34 | sigma_scale), 35 | initial_bias=VarianceScalingConstant( 36 | sigma_scale)) 37 | 38 | if device_id is not None: 39 | self.to_gpu(device_id) 40 | 41 | def _eps(self, shape, dtype): 42 | xp = self.xp 43 | r = xp.random.standard_normal(shape).astype(dtype) 44 | 45 | # apply the function f 46 | return xp.copysign(xp.sqrt(xp.abs(r)), r) 47 | 48 | def __call__(self, x): 49 | if self.mu.W.array is None: 50 | self.mu.W.initialize((self.out_size, numpy.prod(x.shape[1:]))) 51 | if self.sigma.W.array is None: 52 | self.sigma.W.initialize((self.out_size, numpy.prod(x.shape[1:]))) 53 | 54 | # use info of sigma.W to avoid strange error messages 55 | dtype = self.sigma.W.dtype 56 | out_size, in_size = self.sigma.W.shape 57 | 58 | eps_x = self._eps(in_size, dtype) 59 | eps_y = self._eps(out_size, dtype) 60 | W = self.mu.W + self.sigma.W * self.xp.outer(eps_y, eps_x) 61 | if self.nobias: 62 | return F.linear(x, W) 63 | else: 64 | b = self.mu.b + self.sigma.b * eps_y 65 | return F.linear(x, W, b) 66 | -------------------------------------------------------------------------------- /chainerrl/functions/mellowmax.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 
| from __future__ import division 4 | from __future__ import absolute_import 5 | 6 | import chainer 7 | from chainer import functions as F 8 | import numpy as np 9 | import scipy.optimize 10 | 11 | 12 | def mellowmax(values, omega=1., axis=1): 13 | """Mellowmax function. 14 | 15 | This is a kind of softmax function that is, unlike the Boltzmann softmax, 16 | non-expansion. 17 | 18 | See: http://arxiv.org/abs/1612.05628 19 | 20 | Args: 21 | values (Variable or ndarray): 22 | Input values. Mellowmax is taken along the second axis. 23 | omega (float): 24 | Parameter of mellowmax. 25 | axis (int): 26 | Axis along which mellowmax is taken. 27 | Returns: 28 | outputs (Variable) 29 | """ 30 | n = values.shape[axis] 31 | return (F.logsumexp(omega * values, axis=axis) - np.log(n)) / omega 32 | 33 | 34 | def maximum_entropy_mellowmax(values, omega=1., beta_min=-10, beta_max=10): 35 | """Maximum entropy mellowmax policy function. 36 | 37 | This function provides a categorical distribution whose expectation matches 38 | the one of mellowmax function while maximizing its entropy. 39 | 40 | See: http://arxiv.org/abs/1612.05628 41 | 42 | Args: 43 | values (Variable or ndarray): 44 | Input values. Mellowmax is taken along the second axis. 45 | omega (float): 46 | Parameter of mellowmax. 47 | beta_min (float): 48 | Minimum value of beta, used in Brent's algorithm. 49 | beta_max (float): 50 | Maximum value of beta, used in Brent's algorithm. 51 | Returns: 52 | outputs (Variable) 53 | """ 54 | xp = chainer.cuda.get_array_module(values) 55 | mm = mellowmax(values, axis=1) 56 | 57 | # Advantage: Q - mellowmax(Q) 58 | batch_adv = values - F.broadcast_to(F.expand_dims(mm, 1), values.shape) 59 | # Move data to CPU because we use Brent's algorithm in scipy 60 | batch_adv = chainer.cuda.to_cpu(batch_adv.array) 61 | batch_beta = np.empty(mm.shape, dtype=np.float32) 62 | 63 | # Beta is computed as the root of this function 64 | def f(y, adv): 65 | return np.sum(np.exp(y * adv) * adv) 66 | 67 | for idx in np.ndindex(mm.shape): 68 | idx_full = idx[:1] + (slice(None),) + idx[1:] 69 | adv = batch_adv[idx_full] 70 | try: 71 | beta = scipy.optimize.brentq( 72 | f, a=beta_min, b=beta_max, args=(adv,)) 73 | except ValueError: 74 | beta = 0 75 | batch_beta[idx] = beta 76 | 77 | return F.softmax(xp.expand_dims(xp.asarray(batch_beta), 1) * values) 78 | -------------------------------------------------------------------------------- /tests/functions_tests/test_lower_triangular_matrix.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | import unittest 8 | 9 | import numpy 10 | 11 | import chainer 12 | from chainer import cuda 13 | from chainer import gradient_check 14 | from chainer import testing 15 | from chainer.testing import attr 16 | 17 | from chainerrl.functions.lower_triangular_matrix import lower_triangular_matrix 18 | from chainerrl.functions.lower_triangular_matrix import LowerTriangularMatrix 19 | 20 | 21 | @testing.parameterize( 22 | {'n': 1}, 23 | {'n': 2}, 24 | {'n': 3}, 25 | {'n': 4}, 26 | {'n': 5}, 27 | ) 28 | class TestLowerTriangularMatrix(unittest.TestCase): 29 | 30 | def setUp(self): 31 | self.batch_size = 5 32 | self.diag = numpy.random.uniform( 33 | 0.1, 1, (self.batch_size, self.n)).astype(numpy.float32) 34 | non_diag_size 
= self.n * (self.n - 1) // 2 35 | self.non_diag = numpy.random.uniform( 36 | -1, 1, (self.batch_size, non_diag_size)).astype(numpy.float32) 37 | self.gy = numpy.random.uniform( 38 | -1, 1, (self.batch_size, self.n, self.n)).astype(numpy.float32) 39 | 40 | def check_forward(self, diag_data, non_diag_data): 41 | diag = chainer.Variable(diag_data) 42 | non_diag = chainer.Variable(non_diag_data) 43 | y = lower_triangular_matrix(diag, non_diag) 44 | 45 | correct_y = numpy.zeros( 46 | (self.batch_size, self.n, self.n), dtype=numpy.float32) 47 | 48 | tril_rows, tril_cols = numpy.tril_indices(self.n, -1) 49 | correct_y[:, tril_rows, tril_cols] = cuda.to_cpu(non_diag_data) 50 | 51 | diag_rows, diag_cols = numpy.diag_indices(self.n) 52 | correct_y[:, diag_rows, diag_cols] = cuda.to_cpu(diag_data) 53 | 54 | gradient_check.assert_allclose(correct_y, cuda.to_cpu(y.array)) 55 | 56 | def test_forward_cpu(self): 57 | self.check_forward(self.diag, self.non_diag) 58 | 59 | @attr.gpu 60 | def test_forward_gpu(self): 61 | self.check_forward(cuda.to_gpu(self.diag), cuda.to_gpu(self.non_diag)) 62 | 63 | def check_backward(self, x_data, y_grad): 64 | gradient_check.check_backward( 65 | LowerTriangularMatrix(), 66 | x_data, y_grad, eps=1e-2, rtol=1e-2) 67 | 68 | def test_backward_cpu(self): 69 | self.check_backward((self.diag, self.non_diag), self.gy) 70 | 71 | @attr.gpu 72 | def test_backward_gpu(self): 73 | self.check_backward((cuda.to_gpu(self.diag), cuda.to_gpu( 74 | self.non_diag)), cuda.to_gpu(self.gy)) 75 | 76 | 77 | testing.run_module(__name__, __file__) 78 | -------------------------------------------------------------------------------- /chainerrl/optimizers/rmsprop_async.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainer import cuda 10 | from chainer import optimizer 11 | import numpy 12 | 13 | 14 | _default_hyperparam = optimizer.Hyperparameter() 15 | _default_hyperparam.lr = 0.01 16 | _default_hyperparam.alpha = 0.99 17 | _default_hyperparam.eps = 1e-8 18 | 19 | 20 | class RMSpropAsyncRule(optimizer.UpdateRule): 21 | 22 | def __init__(self, parent_hyperparam=None, lr=None, alpha=None, eps=None): 23 | super(RMSpropAsyncRule, self).__init__( 24 | parent_hyperparam or _default_hyperparam) 25 | if lr is not None: 26 | self.hyperparam.lr = lr 27 | if alpha is not None: 28 | self.hyperparam.alpha = alpha 29 | if eps is not None: 30 | self.hyperparam.eps = eps 31 | 32 | def init_state(self, param): 33 | xp = cuda.get_array_module(param.array) 34 | with cuda.get_device_from_array(param.array): 35 | self.state['ms'] = xp.zeros_like(param.array) 36 | 37 | def update_core_cpu(self, param): 38 | grad = param.grad 39 | if grad is None: 40 | return 41 | hp = self.hyperparam 42 | ms = self.state['ms'] 43 | 44 | ms *= hp.alpha 45 | ms += (1 - hp.alpha) * grad * grad 46 | param.array -= hp.lr * grad / numpy.sqrt(ms + hp.eps) 47 | 48 | def update_core_gpu(self, param): 49 | grad = param.grad 50 | if grad is None: 51 | return 52 | cuda.elementwise( 53 | 'T grad, T lr, T alpha, T eps', 54 | 'T param, T ms', 55 | '''ms = alpha * ms + (1 - alpha) * grad * grad; 56 | param -= lr * grad / sqrt(ms + eps);''', 57 | 'rmsprop')(grad, self.hyperparam.lr, self.hyperparam.alpha, 58 | 
self.hyperparam.eps, param.array, self.state['ms']) 59 | 60 | 61 | class RMSpropAsync(optimizer.GradientMethod): 62 | 63 | """RMSprop for asynchronous methods. 64 | 65 | The only difference from chainer.optimizers.RMSprop in that the epsilon is 66 | outside the square root. 67 | """ 68 | 69 | def __init__(self, lr=_default_hyperparam.lr, 70 | alpha=_default_hyperparam.alpha, eps=_default_hyperparam.eps): 71 | super(RMSpropAsync, self).__init__() 72 | self.hyperparam.lr = lr 73 | self.hyperparam.alpha = alpha 74 | self.hyperparam.eps = eps 75 | 76 | lr = optimizer.HyperparameterProxy('lr') 77 | alpha = optimizer.HyperparameterProxy('alpha') 78 | eps = optimizer.HyperparameterProxy('eps') 79 | 80 | def create_update_rule(self): 81 | return RMSpropAsyncRule(self.hyperparam) 82 | -------------------------------------------------------------------------------- /tests/functions_tests/test_invert_gradients.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | 6 | import unittest 7 | 8 | import chainer 9 | from chainer import cuda 10 | from chainer import functions 11 | from chainer import testing 12 | from chainer.testing import attr 13 | from chainer.testing import condition 14 | import numpy 15 | 16 | from chainerrl.functions.invert_gradients import invert_gradients 17 | 18 | 19 | @testing.parameterize(*testing.product({ 20 | 'shape': [(), (1, 1), (2, 3), (2, 3, 4), (2, 3, 4, 5)], 21 | 'dtype': [numpy.float32], 22 | })) 23 | class TestInvertGradients(unittest.TestCase): 24 | 25 | def setUp(self): 26 | self.x = numpy.random.uniform(-1, 1, self.shape).astype(self.dtype) 27 | 28 | def check_forward(self, x_data): 29 | 30 | # In chainer, update will be like x.array -= lr * x.grad, 31 | # which means negative gradients will increase values. 
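        # Three regimes are exercised below: the range straddling x,
        # x above range_max, and x below range_min.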
32 | 33 | # Not exceeding 34 | range_max = x_data + 0.1 35 | range_min = x_data - 0.1 36 | x = chainer.Variable(x_data) 37 | y = invert_gradients(x, range_min=range_min, range_max=range_max) 38 | 39 | loss = functions.sum(y) # Minimize y 40 | loss.backward() 41 | self.assertTrue((x.grad > 0).all()) # Decrease x 42 | x.cleargrad() 43 | 44 | loss = -functions.sum(y) # Maximize y 45 | loss.backward() 46 | self.assertTrue((x.grad < 0).all()) # Increase x 47 | x.cleargrad() 48 | 49 | # Exceeding range_max 50 | range_max = x_data - 0.1 51 | range_min = x_data - 0.2 52 | y = invert_gradients(x, range_min=range_min, range_max=range_max) 53 | 54 | loss = functions.sum(y) # Minimize y 55 | loss.backward() 56 | self.assertTrue((x.grad > 0).all()) # Decrease x 57 | x.cleargrad() 58 | 59 | loss = -functions.sum(y) # Maximize y 60 | loss.backward() 61 | self.assertTrue((x.grad > 0).all()) # Decrease x 62 | x.cleargrad() 63 | 64 | # Exceeding range_min 65 | range_max = x_data + 0.2 66 | range_min = x_data + 0.1 67 | y = invert_gradients(x, range_min=range_min, range_max=range_max) 68 | 69 | loss = functions.sum(y) # Minimize y 70 | loss.backward() 71 | self.assertTrue((x.grad < 0).all()) # Increase x 72 | x.cleargrad() 73 | 74 | loss = -functions.sum(y) # Maximize y 75 | loss.backward() 76 | self.assertTrue((x.grad < 0).all()) # Increase x 77 | x.cleargrad() 78 | 79 | @condition.retry(3) 80 | def test_forward_cpu(self): 81 | self.check_forward(self.x) 82 | 83 | @attr.gpu 84 | @condition.retry(3) 85 | def test_forward_gpu(self): 86 | self.check_forward(cuda.to_gpu(self.x)) 87 | 88 | 89 | testing.run_module(__name__, __file__) 90 | -------------------------------------------------------------------------------- /tests/misc_tests/test_collections.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import collections 10 | import unittest 11 | 12 | from chainer import testing 13 | 14 | from chainerrl.misc.collections import RandomAccessQueue 15 | 16 | 17 | @testing.parameterize(*( 18 | testing.product({ 19 | 'maxlen': [1, 10, None], 20 | 'init_seq': [None, [], range(5)], 21 | }) 22 | )) 23 | class TestRandomAccessQueue(unittest.TestCase): 24 | def setUp(self): 25 | if self.init_seq: 26 | self.y_queue = RandomAccessQueue(self.init_seq, maxlen=self.maxlen) 27 | self.t_queue = collections.deque(self.init_seq, maxlen=self.maxlen) 28 | else: 29 | self.y_queue = RandomAccessQueue(maxlen=self.maxlen) 30 | self.t_queue = collections.deque(maxlen=self.maxlen) 31 | 32 | def test1(self): 33 | self.check_all() 34 | 35 | self.check_popleft() 36 | self.do_append(10) 37 | self.check_all() 38 | 39 | self.check_popleft() 40 | self.check_popleft() 41 | self.do_append(11) 42 | self.check_all() 43 | 44 | # test negative indices 45 | n = len(self.t_queue) 46 | for i in range(-n, 0): 47 | self.check_getitem(i) 48 | 49 | for k in range(4): 50 | self.do_extend(range(k)) 51 | self.check_all() 52 | 53 | for k in range(4): 54 | self.check_popleft() 55 | self.do_extend(range(k)) 56 | self.check_all() 57 | 58 | for k in range(10): 59 | self.do_append(20 + k) 60 | self.check_popleft() 61 | self.check_popleft() 62 | self.check_all() 63 | 64 | for _ in range(100): 65 | self.check_popleft() 66 | 67 | def check_all(self): 
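        # The queue under test must agree with the reference deque on
        # length and on the element at every index.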
68 | self.check_len() 69 | n = len(self.t_queue) 70 | for i in range(n): 71 | self.check_getitem(i) 72 | 73 | def check_len(self): 74 | self.assertEqual(len(self.y_queue), len(self.t_queue)) 75 | 76 | def check_getitem(self, i): 77 | self.assertEqual(self.y_queue[i], self.t_queue[i]) 78 | 79 | def do_setitem(self, i, x): 80 | self.y_queue[i] = x 81 | self.t_queue[i] = x 82 | 83 | def do_append(self, x): 84 | self.y_queue.append(x) 85 | self.t_queue.append(x) 86 | 87 | def do_extend(self, xs): 88 | self.y_queue.extend(xs) 89 | self.t_queue.extend(xs) 90 | 91 | def check_popleft(self): 92 | try: 93 | t = self.t_queue.popleft() 94 | except IndexError: 95 | with self.assertRaises(IndexError): 96 | self.y_queue.popleft() 97 | else: 98 | self.assertEqual(self.y_queue.popleft(), t) 99 | -------------------------------------------------------------------------------- /tests/wrappers_tests/test_randomize_action.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | from chainer import testing 12 | from chainer.testing import condition 13 | import gym 14 | import gym.spaces 15 | 16 | import chainerrl 17 | 18 | 19 | class ActionRecordingEnv(gym.Env): 20 | 21 | observation_space = gym.spaces.Box(low=-1, high=1, shape=(1,)) 22 | action_space = gym.spaces.Discrete(3) 23 | 24 | def __init__(self): 25 | self.past_actions = [] 26 | 27 | def reset(self): 28 | return self.observation_space.sample() 29 | 30 | def step(self, action): 31 | self.past_actions.append(action) 32 | return self.observation_space.sample(), 0, False, {} 33 | 34 | 35 | @testing.parameterize(*testing.product({ 36 | 'random_fraction': [0, 0.3, 0.6, 1], 37 | })) 38 | class TestRandomizeAction(unittest.TestCase): 39 | 40 | @condition.retry(3) 41 | def test_action_ratio(self): 42 | random_fraction = self.random_fraction 43 | env = ActionRecordingEnv() 44 | env = chainerrl.wrappers.RandomizeAction( 45 | env, random_fraction=random_fraction) 46 | env.reset() 47 | n = 1000 48 | delta = 0.05 49 | for _ in range(n): 50 | # Always send action 0 51 | env.step(0) 52 | # Ratio of selected actions should be: 53 | # 0: (1 - random_fraction) + random_fraction/3 54 | # 1: random_fraction/3 55 | # 2: random_fraction/3 56 | self.assertAlmostEqual( 57 | env.env.past_actions.count(0) / n, 58 | (1 - random_fraction) + random_fraction / 3, delta=delta) 59 | self.assertAlmostEqual( 60 | env.env.past_actions.count(1) / n, 61 | random_fraction / 3, delta=delta) 62 | self.assertAlmostEqual( 63 | env.env.past_actions.count(2) / n, 64 | random_fraction / 3, delta=delta) 65 | 66 | @condition.retry(3) 67 | def test_seed(self): 68 | 69 | def get_actions(seed): 70 | random_fraction = self.random_fraction 71 | env = ActionRecordingEnv() 72 | env = chainerrl.wrappers.RandomizeAction( 73 | env, random_fraction=random_fraction) 74 | env.seed(seed) 75 | for _ in range(1000): 76 | # Always send action 0 77 | env.step(0) 78 | return env.env.past_actions 79 | 80 | a_seed0 = get_actions(0) 81 | a_seed1 = get_actions(1) 82 | b_seed0 = get_actions(0) 83 | b_seed1 = get_actions(1) 84 | 85 | self.assertEqual(a_seed0, b_seed0) 86 | self.assertEqual(a_seed1, b_seed1) 87 | if self.random_fraction > 0: 88 | self.assertNotEqual(a_seed0, a_seed1) 
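# With random_fraction == 0 the wrapper always forwards the agent's action, so runs with different seeds produce identical histories and cannot be expected to differ.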
89 | -------------------------------------------------------------------------------- /chainerrl/misc/copy_param.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from chainer import links as L 10 | 11 | 12 | def copy_param(target_link, source_link): 13 | """Copy parameters of a link to another link.""" 14 | target_params = dict(target_link.namedparams()) 15 | for param_name, param in source_link.namedparams(): 16 | if target_params[param_name].array is None: 17 | raise TypeError( 18 | 'target_link parameter {} is None. Maybe the model params are ' 19 | 'not initialized.\nPlease try to forward dummy input ' 20 | 'beforehand to determine parameter shape of the model.'.format( 21 | param_name)) 22 | target_params[param_name].array[:] = param.array 23 | 24 | # Copy Batch Normalization's statistics 25 | target_links = dict(target_link.namedlinks()) 26 | for link_name, link in source_link.namedlinks(): 27 | if isinstance(link, L.BatchNormalization): 28 | target_bn = target_links[link_name] 29 | target_bn.avg_mean[:] = link.avg_mean 30 | target_bn.avg_var[:] = link.avg_var 31 | 32 | 33 | def soft_copy_param(target_link, source_link, tau): 34 | """Soft-copy parameters of a link to another link.""" 35 | target_params = dict(target_link.namedparams()) 36 | for param_name, param in source_link.namedparams(): 37 | if target_params[param_name].array is None: 38 | raise TypeError( 39 | 'target_link parameter {} is None. Maybe the model params are ' 40 | 'not initialized.\nPlease try to forward dummy input ' 41 | 'beforehand to determine parameter shape of the model.'.format( 42 | param_name)) 43 | target_params[param_name].array[:] *= (1 - tau) 44 | target_params[param_name].array[:] += tau * param.array 45 | 46 | # Soft-copy Batch Normalization's statistics 47 | target_links = dict(target_link.namedlinks()) 48 | for link_name, link in source_link.namedlinks(): 49 | if isinstance(link, L.BatchNormalization): 50 | target_bn = target_links[link_name] 51 | target_bn.avg_mean[:] *= (1 - tau) 52 | target_bn.avg_mean[:] += tau * link.avg_mean 53 | target_bn.avg_var[:] *= (1 - tau) 54 | target_bn.avg_var[:] += tau * link.avg_var 55 | 56 | 57 | def copy_grad(target_link, source_link): 58 | """Copy gradients of a link to another link.""" 59 | target_params = dict(target_link.namedparams()) 60 | for param_name, param in source_link.namedparams(): 61 | target_params[param_name].grad[:] = param.grad 62 | 63 | 64 | def synchronize_parameters(src, dst, method, tau=None): 65 | {'hard': lambda: copy_param(dst, src), 66 | 'soft': lambda: soft_copy_param(dst, src, tau), 67 | }[method]() 68 | -------------------------------------------------------------------------------- /tests/links_tests/test_noisy_linear.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import chainer 4 | from chainer import cuda 5 | from chainer import testing 6 | from chainer.testing import attr 7 | from chainer.testing import condition 8 | import numpy 9 | 10 | from chainerrl.links import noisy_linear 11 | 12 | 13 | @testing.parameterize(*testing.product({ 14 | 'size_args': [ 15 | (5,), # uninitialized from Chainer v2 16 | (None, 5), # uninitialized 17 | (6, 5), # 
initialized 18 | ], 19 | 'nobias': [False, True], 20 | })) 21 | class TestFactorizedNoisyLinear(unittest.TestCase): 22 | def setUp(self): 23 | mu = chainer.links.Linear(*self.size_args, nobias=self.nobias) 24 | self.linear = noisy_linear.FactorizedNoisyLinear(mu) 25 | 26 | def _test_calls(self, xp): 27 | x_data = xp.arange(12).astype(numpy.float32).reshape((2, 6)) 28 | x = chainer.Variable(x_data) 29 | self.linear(x) 30 | self.linear(x_data + 1) 31 | self.linear(x_data.reshape((2, 3, 2))) 32 | 33 | def test_calls_cpu(self): 34 | self._test_calls(numpy) 35 | 36 | @attr.gpu 37 | def test_calls_gpu(self): 38 | self.linear.to_gpu(0) 39 | self._test_calls(cuda.cupy) 40 | 41 | @attr.gpu 42 | def test_calls_gpu_after_to_gpu(self): 43 | mu = self.linear.mu 44 | mu.to_gpu(0) 45 | self.linear = noisy_linear.FactorizedNoisyLinear(mu) 46 | self._test_calls(cuda.cupy) 47 | 48 | def _test_randomness(self, xp): 49 | x = xp.random.standard_normal((10, 6)).astype(numpy.float32) 50 | y1 = self.linear(x).array 51 | y2 = self.linear(x).array 52 | d = float(xp.mean(xp.square(y1 - y2))) 53 | 54 | # The parameter name suggests that 55 | # xp.sqrt(d / 2) is approximately sigma_scale = 0.4 56 | # In fact, for each element [i, j], it holds: 57 | # \E[(y2 - y1) ** 2] = 2 * \Var(y) = (4 / pi) * sigma_scale ** 2 58 | 59 | target = (0.4 ** 2) * 2 60 | if self.nobias: 61 | target *= 2 / numpy.pi 62 | else: 63 | target *= 2 / numpy.pi + numpy.sqrt(2 / numpy.pi) / y1.shape[1] 64 | 65 | self.assertGreater(d, target / 3.) 66 | self.assertLess(d, target * 3.) 67 | 68 | @condition.retry(3) 69 | def test_randomness_cpu(self): 70 | self._test_randomness(numpy) 71 | 72 | @attr.gpu 73 | @condition.retry(3) 74 | def test_randomness_gpu(self): 75 | self.linear.to_gpu(0) 76 | self._test_randomness(cuda.cupy) 77 | 78 | def _test_non_randomness(self, xp): 79 | # Noises should be the same in a batch 80 | x0 = xp.random.standard_normal((1, 6)).astype(numpy.float32) 81 | x = xp.broadcast_to(x0, (2, 6)) 82 | y = self.linear(x).array 83 | xp.testing.assert_allclose(y[0], y[1], rtol=1e-4) 84 | 85 | def test_non_randomness_cpu(self): 86 | self._test_non_randomness(numpy) 87 | 88 | @attr.gpu 89 | def test_non_randomness_gpu(self): 90 | self.linear.to_gpu(0) 91 | self._test_non_randomness(cuda.cupy) 92 | -------------------------------------------------------------------------------- /examples/atari/dqn/README.md: -------------------------------------------------------------------------------- 1 | # DQN 2 | This example trains a DQN agent as described in the following paper: [Human-level control through Deep Reinforcement Learning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf). 3 | 4 | ## Requirements 5 | 6 | - atari_py>=0.1.1 7 | - opencv-python 8 | 9 | ## Running the Example 10 | 11 | ``` 12 | python train_dqn.py [options] 13 | ``` 14 | 15 | ### Useful Options 16 | - `--gpu`. Specifies the GPU. If you do not have a GPU on your machine, run the example with the option `--gpu -1`. E.g. `python train_dqn.py --gpu -1`. 17 | - `--env`. Specifies the environment. 18 | - `--render`. Add this option to render the states in a GUI window. 19 | - `--seed`. This option specifies the random seed used. 20 | - `--outdir`. This option specifies the output directory to which the results are written. 21 | 22 | To view the full list of options, either view the code or run the example with the `--help` option. 23 | 24 | ## Results 25 | These results reflect ChainerRL `v0.5.0`.
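The documented options above should be enough to regenerate entries of the table below; a typical invocation might look like this (the environment ID is illustrative and depends on your `atari_py`/Gym installation):

```
python train_dqn.py --env BreakoutNoFrameskip-v4 --gpu 0 --seed 0 --outdir results
```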
26 | 27 | | Game | Score | Reported Scores | 28 | | ------------- |:-------------:|:-------------:| 29 | | AirRaid | N/A| N/A| 30 | | Alien | N/A| **3069**| 31 | | Amidar | N/A| **739.5**| 32 | | Assault | N/A| **3359**| 33 | | Asterix | N/A| **6012**| 34 | | Asteroids | N/A| **1629**| 35 | | Atlantis | N/A| **85641**| 36 | | Bank Heist | N/A| **429.7**| 37 | | Battlezone | N/A| **26300**| 38 | | Beamrider | N/A| **6846**| 39 | | Berzerk | N/A| N/A| 40 | | Bowling | N/A| **42.4**| 41 | | Boxing | N/A| **71.8**| 42 | | Breakout | N/A| **401.2**| 43 | | Carnival | N/A| N/A| 44 | | Centipede | N/A| **8309**| 45 | | Chopper Command | N/A| **6687**| 46 | | Crazy Climber | N/A| **114103**| 47 | | Demon Attack | N/A| **9711**| 48 | | Double Dunk | N/A| **-18.1**| 49 | | Elevator Action | N/A| N/A| 50 | | Enduro | N/A| **301.8**| 51 | | Fishing Derby | N/A| **-0.8**| 52 | | Freeway | N/A| **30.3**| 53 | | Frostbite | N/A| **328.3**| 54 | | Gopher | N/A| **8520**| 55 | | Gravitar | N/A| **306.7**| 56 | | H.E.R.O. | N/A| **19950**| 57 | | Ice Hockey | N/A| **-1.6**| 58 | | James Bond 007 | N/A| **576.7**| 59 | | Journey Escape | N/A| N/A| 60 | | Kangaroo | N/A| **6740**| 61 | | Krull | N/A| **3805**| 62 | | Kung-Fu Master | N/A| **23270**| 63 | | Montezuma's Revenge | N/A| **0**| 64 | | Ms. Pac-Man | N/A| **2311**| 65 | | Name This Game | N/A| **7257**| 66 | | Phoenix | N/A| N/A| 67 | | Pitfall II | N/A| N/A| 68 | | Pitfall! | N/A| N/A| 69 | | Pong | N/A| **18.9**| 70 | | Pooyan | N/A| N/A| 71 | | Private Eye | N/A| **1788**| 72 | | Qbert | N/A| **10596**| 73 | | River Raid | N/A| **8316**| 74 | | Road Runner | N/A| **18257**| 75 | | Robot Tank | N/A| **51.6**| 76 | | Seaquest | N/A| **5286**| 77 | | Skiing | N/A| N/A| 78 | | Solaris | N/A| N/A| 79 | | Space Invaders | N/A| **1976**| 80 | | Stargunner | N/A| **57997**| 81 | | Tennis | N/A| **-2.5**| 82 | | Time Pilot | N/A| **5947**| 83 | | Tutankham | N/A| **186.7**| 84 | | Up’n Down | N/A| **8456**| 85 | | Venture | N/A| **380.0**| 86 | | Video Pinball | N/A| **42684**| 87 | | WizardOfWor | N/A| **3393**| 88 | | YarsRevenge | N/A| N/A| 89 | | Zaxxon | N/A| **4977**| 90 | 91 | -------------------------------------------------------------------------------- /tests/test_ale.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import random 10 | import sys 11 | import tempfile 12 | import unittest 13 | 14 | import numpy as np 15 | from PIL import Image 16 | 17 | from chainerrl.envs import ale 18 | 19 | 20 | class TestALE(unittest.TestCase): 21 | 22 | def setUp(self): 23 | pass 24 | 25 | def test_state(self): 26 | env = ale.ALE('breakout') 27 | self.assertEqual(len(env.state), 4) 28 | for s in env.state: 29 | self.assertEqual(s.shape, (84, 84)) 30 | self.assertEqual(s.dtype, np.uint8) 31 | 32 | def test_episode(self): 33 | env = ale.ALE('breakout') 34 | self.assertFalse(env.is_terminal) 35 | last_state = env.state 36 | while not env.is_terminal: 37 | 38 | # test state 39 | self.assertEqual(len(env.state), 4) 40 | for s in env.state: 41 | self.assertEqual(s.shape, (84, 84)) 42 | self.assertEqual(s.dtype, np.uint8) 43 | 44 | print('state (sum)', sum(env.state).sum()) 45 | 46 | legal_actions = env.legal_actions 47 | 
print('legal_actions:', legal_actions) 48 | self.assertGreater(len(legal_actions), 0) 49 | a = random.randrange(len(legal_actions)) 50 | print('a', a) 51 | env.receive_action(a) 52 | if not env.is_terminal: 53 | np.testing.assert_array_equal( 54 | np.asarray(last_state[1:]), np.asarray(env.state[:3])) 55 | last_state = env.state 56 | 57 | def test_current_screen(self): 58 | env = ale.ALE('breakout') 59 | tempdir = tempfile.mkdtemp() 60 | print('tempdir: {}'.format(tempdir), file=sys.stderr) 61 | for episode in range(6): 62 | env.initialize() 63 | t = 0 64 | while not env.is_terminal: 65 | for i in range(4): 66 | screen = env.state[i] 67 | self.assertEqual(screen.dtype, np.uint8) 68 | img = Image.fromarray(screen, mode='L') 69 | filename = '{}/{}_{}_{}.bmp'.format( 70 | tempdir, str(episode).zfill(6), str(t).zfill(6), i) 71 | img.save(filename) 72 | legal_actions = env.legal_actions 73 | a = random.randrange(len(legal_actions)) 74 | env.receive_action(a) 75 | t += 1 76 | 77 | def test_reward(self): 78 | env = ale.ALE('pong') 79 | for episode in range(3): 80 | total_r = 0 81 | while not env.is_terminal: 82 | a = random.randrange(len(env.legal_actions)) 83 | env.receive_action(a) 84 | total_r += env.reward 85 | self.assertGreater(total_r, -22) 86 | self.assertLess(total_r, -15) 87 | env.initialize() 88 | 89 | def test_seed(self): 90 | ale.ALE('breakout', seed=0) 91 | ale.ALE('breakout', seed=2 ** 31 - 1) 92 | -------------------------------------------------------------------------------- /tests/links_tests/test_empirical_normalization.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import chainer 4 | from chainer import testing 5 | import numpy as np 6 | 7 | from chainerrl.links import empirical_normalization 8 | 9 | 10 | class TestEmpiricalNormalization(unittest.TestCase): 11 | def test_small_cpu(self): 12 | self._test_small(gpu=-1) 13 | 14 | @testing.attr.gpu 15 | def test_small_gpu(self): 16 | self._test_small(gpu=0) 17 | 18 | def _test_small(self, gpu): 19 | en = empirical_normalization.EmpiricalNormalization(10) 20 | if gpu >= 0: 21 | chainer.cuda.get_device_from_id(gpu).use() 22 | en.to_gpu() 23 | 24 | xp = en.xp 25 | 26 | xs = [] 27 | for t in range(10): 28 | x = xp.random.normal(loc=4, scale=2, size=(t + 3, 10)) 29 | en(x) 30 | xs.extend(list(x)) 31 | xs = xp.stack(xs) 32 | true_mean = xp.mean(xs, axis=0) 33 | true_std = xp.std(xs, axis=0) 34 | xp.testing.assert_allclose(en.mean, true_mean, rtol=1e-4) 35 | xp.testing.assert_allclose(en.std, true_std, rtol=1e-4) 36 | 37 | @testing.attr.slow 38 | def test_large(self): 39 | en = empirical_normalization.EmpiricalNormalization(10) 40 | for _ in range(10000): 41 | x = np.random.normal(loc=4, scale=2, size=(7, 10)) 42 | en(x) 43 | x = 2 * np.random.normal(loc=4, scale=2, size=(1, 10)) 44 | enx = en(x, update=False) 45 | 46 | np.testing.assert_allclose(en.mean, 4, rtol=1e-1) 47 | np.testing.assert_allclose(en.std, 2, rtol=1e-1) 48 | 49 | # Compare with the ground-truth normalization 50 | np.testing.assert_allclose((x - 4) / 2, enx, rtol=1e-1) 51 | 52 | # Test inverse 53 | np.testing.assert_allclose(x, en.inverse(enx), rtol=1e-4) 54 | 55 | def test_batch_axis(self): 56 | shape = (2, 3, 4) 57 | for batch_axis in range(3): 58 | en = empirical_normalization.EmpiricalNormalization( 59 | shape=shape[:batch_axis] + shape[batch_axis + 1:], 60 | batch_axis=batch_axis, 61 | ) 62 | for _ in range(10): 63 | x = np.random.rand(*shape) 64 | en(x) 65 | 66 | def test_until(self): 67 | en = 
empirical_normalization.EmpiricalNormalization(7, until=20) 68 | last_mean = None 69 | last_std = None 70 | for t in range(15): 71 | en(np.random.rand(2, 7) + t) 72 | 73 | if 1 <= t < 10: 74 | self.assertFalse(np.allclose(en.mean, last_mean, rtol=1e-4)) 75 | self.assertFalse(np.allclose(en.std, last_std, rtol=1e-4)) 76 | elif t >= 10: 77 | np.testing.assert_allclose(en.mean, last_mean, rtol=1e-4) 78 | np.testing.assert_allclose(en.std, last_std, rtol=1e-4) 79 | 80 | last_mean = en.mean 81 | last_std = en.std 82 | 83 | def test_mixed_inputs(self): 84 | en = empirical_normalization.EmpiricalNormalization(7) 85 | for t in range(5): 86 | y = en(np.random.rand(t + 1, 7)) 87 | self.assertIsInstance(y, np.ndarray) 88 | y = en(chainer.Variable(np.random.rand(t + 1, 7))) 89 | self.assertIsInstance(y, chainer.Variable) 90 | -------------------------------------------------------------------------------- /tests/experiments_tests/test_train_agent.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | import tempfile 9 | import unittest 10 | 11 | import mock 12 | 13 | import chainerrl 14 | 15 | 16 | class TestTrainAgent(unittest.TestCase): 17 | 18 | def test(self): 19 | 20 | outdir = tempfile.mkdtemp() 21 | 22 | agent = mock.Mock() 23 | env = mock.Mock() 24 | # Reaches the terminal state after five actions 25 | env.reset.side_effect = [('state', 0)] 26 | env.step.side_effect = [ 27 | (('state', 1), 0, False, {}), 28 | (('state', 2), 0, False, {}), 29 | (('state', 3), -0.5, False, {}), 30 | (('state', 4), 0, False, {}), 31 | (('state', 5), 1, True, {}), 32 | ] 33 | hook = mock.Mock() 34 | 35 | chainerrl.experiments.train_agent( 36 | agent=agent, 37 | env=env, 38 | steps=5, 39 | outdir=outdir, 40 | step_hooks=[hook]) 41 | 42 | self.assertEqual(agent.act_and_train.call_count, 5) 43 | self.assertEqual(agent.stop_episode_and_train.call_count, 1) 44 | 45 | self.assertEqual(env.reset.call_count, 1) 46 | self.assertEqual(env.step.call_count, 5) 47 | 48 | self.assertEqual(hook.call_count, 5) 49 | # A hook receives (env, agent, step) 50 | for i, call in enumerate(hook.call_args_list): 51 | args, kwargs = call 52 | self.assertEqual(args[0], env) 53 | self.assertEqual(args[1], agent) 54 | # step starts with 1 55 | self.assertEqual(args[2], i + 1) 56 | 57 | def test_needs_reset(self): 58 | 59 | outdir = tempfile.mkdtemp() 60 | 61 | agent = mock.Mock() 62 | env = mock.Mock() 63 | # First episode: 0 -> 1 -> 2 -> 3 (reset) 64 | # Second episode: 4 -> 5 -> 6 -> 7 (done) 65 | env.reset.side_effect = [('state', 0), ('state', 4)] 66 | env.step.side_effect = [ 67 | (('state', 1), 0, False, {}), 68 | (('state', 2), 0, False, {}), 69 | (('state', 3), 0, False, {'needs_reset': True}), 70 | (('state', 5), -0.5, False, {}), 71 | (('state', 6), 0, False, {}), 72 | (('state', 7), 1, True, {}), 73 | ] 74 | hook = mock.Mock() 75 | 76 | chainerrl.experiments.train_agent( 77 | agent=agent, 78 | env=env, 79 | steps=5, 80 | outdir=outdir, 81 | step_hooks=[hook]) 82 | 83 | self.assertEqual(agent.act_and_train.call_count, 5) 84 | self.assertEqual(agent.stop_episode_and_train.call_count, 2) 85 | 86 | self.assertEqual(env.reset.call_count, 2) 87 | self.assertEqual(env.step.call_count, 5) 88 | 89 | 
self.assertEqual(hook.call_count, 5) 90 | # A hook receives (env, agent, step) 91 | for i, call in enumerate(hook.call_args_list): 92 | args, kwargs = call 93 | self.assertEqual(args[0], env) 94 | self.assertEqual(args[1], agent) 95 | # step starts with 1 96 | self.assertEqual(args[2], i + 1) 97 | -------------------------------------------------------------------------------- /chainerrl/explorers/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from logging import getLogger 10 | 11 | import numpy as np 12 | 13 | from chainerrl import explorer 14 | 15 | 16 | def select_action_epsilon_greedily(epsilon, random_action_func, 17 | greedy_action_func): 18 | if np.random.rand() < epsilon: 19 | return random_action_func(), False 20 | else: 21 | return greedy_action_func(), True 22 | 23 | 24 | class ConstantEpsilonGreedy(explorer.Explorer): 25 | """Epsilon-greedy with constant epsilon. 26 | 27 | Args: 28 | epsilon: epsilon used 29 | random_action_func: function with no argument that returns action 30 | logger: logger used 31 | """ 32 | 33 | def __init__(self, epsilon, random_action_func, 34 | logger=getLogger(__name__)): 35 | assert epsilon >= 0 and epsilon <= 1 36 | self.epsilon = epsilon 37 | self.random_action_func = random_action_func 38 | self.logger = logger 39 | 40 | def select_action(self, t, greedy_action_func, action_value=None): 41 | a, greedy = select_action_epsilon_greedily( 42 | self.epsilon, self.random_action_func, greedy_action_func) 43 | greedy_str = 'greedy' if greedy else 'non-greedy' 44 | self.logger.debug('t:%s a:%s %s', t, a, greedy_str) 45 | return a 46 | 47 | def __repr__(self): 48 | return 'ConstantEpsilonGreedy(epsilon={})'.format(self.epsilon) 49 | 50 | 51 | class LinearDecayEpsilonGreedy(explorer.Explorer): 52 | """Epsilon-greedy with linearly decayed epsilon. 53 | 54 | Args: 55 | start_epsilon: max value of epsilon 56 | end_epsilon: min value of epsilon 57 | decay_steps: how many steps it takes for epsilon to decay 58 | random_action_func: function with no argument that returns action 59 | logger: logger used 60 | """ 61 | 62 | def __init__(self, start_epsilon, end_epsilon, 63 | decay_steps, random_action_func, logger=getLogger(__name__)): 64 | assert start_epsilon >= 0 and start_epsilon <= 1 65 | assert end_epsilon >= 0 and end_epsilon <= 1 66 | assert decay_steps >= 0 67 | self.start_epsilon = start_epsilon 68 | self.end_epsilon = end_epsilon 69 | self.decay_steps = decay_steps 70 | self.random_action_func = random_action_func 71 | self.logger = logger 72 | self.epsilon = start_epsilon 73 | 74 | def compute_epsilon(self, t): 75 | if t >= self.decay_steps: # >= also covers decay_steps == 0 76 | return self.end_epsilon 77 | else: 78 | epsilon_diff = self.end_epsilon - self.start_epsilon 79 | return self.start_epsilon + epsilon_diff * (t / self.decay_steps) 80 | 81 | def select_action(self, t, greedy_action_func, action_value=None): 82 | self.epsilon = self.compute_epsilon(t) 83 | a, greedy = select_action_epsilon_greedily( 84 | self.epsilon, self.random_action_func, greedy_action_func) 85 | greedy_str = 'greedy' if greedy else 'non-greedy' 86 | self.logger.debug('t:%s a:%s %s', t, a, greedy_str) 87 | return a 88 | 89 | def __repr__(self): 90 | return
'LinearDecayEpsilonGreedy(epsilon={})'.format(self.epsilon) 91 | -------------------------------------------------------------------------------- /tests/envs_tests/test_vector_envs.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import unittest 10 | 11 | from chainer import testing 12 | import gym 13 | import numpy as np 14 | 15 | import chainerrl 16 | 17 | 18 | @testing.parameterize(*testing.product({ 19 | 'num_envs': [1, 2, 3], 20 | 'env_id': ['CartPole-v0', 'Pendulum-v0'], 21 | 'random_seed_offset': [0, 100], 22 | 'vector_env_to_test': ['SerialVectorEnv', 'MultiprocessVectorEnv'], 23 | })) 24 | class TestVectorEnv(unittest.TestCase): 25 | 26 | def setUp(self): 27 | # Init VectorEnv to test 28 | if self.vector_env_to_test == 'SerialVectorEnv': 29 | self.vec_env = chainerrl.envs.SerialVectorEnv( 30 | [gym.make(self.env_id) for _ in range(self.num_envs)]) 31 | elif self.vector_env_to_test == 'MultiprocessVectorEnv': 32 | self.vec_env = chainerrl.envs.MultiprocessVectorEnv( 33 | [(lambda: gym.make(self.env_id)) 34 | for _ in range(self.num_envs)]) 35 | else: 36 | assert False 37 | # Init envs to compare against 38 | self.envs = [gym.make(self.env_id) for _ in range(self.num_envs)] 39 | 40 | def tearDown(self): 41 | # Delete so that all the subprocesses are joined 42 | del self.vec_env 43 | 44 | def test_num_envs(self): 45 | self.assertEqual(self.vec_env.num_envs, self.num_envs) 46 | 47 | def test_action_space(self): 48 | self.assertEqual(self.vec_env.action_space, self.envs[0].action_space) 49 | 50 | def test_observation_space(self): 51 | self.assertEqual( 52 | self.vec_env.observation_space, self.envs[0].observation_space) 53 | 54 | def test_seed_reset_and_step(self): 55 | # seed 56 | seeds = [self.random_seed_offset + i for i in range(self.num_envs)] 57 | self.vec_env.seed(seeds) 58 | for env, seed in zip(self.envs, seeds): 59 | env.seed(seed) 60 | 61 | # reset 62 | obss = self.vec_env.reset() 63 | real_obss = [env.reset() for env in self.envs] 64 | np.testing.assert_allclose(obss, real_obss) 65 | 66 | # step 67 | actions = [env.action_space.sample() for env in self.envs] 68 | real_obss, real_rewards, real_dones, real_infos = zip(*[ 69 | env.step(action) for env, action in zip(self.envs, actions)]) 70 | obss, rewards, dones, infos = self.vec_env.step(actions) 71 | np.testing.assert_allclose(obss, real_obss) 72 | self.assertEqual(rewards, real_rewards) 73 | self.assertEqual(dones, real_dones) 74 | self.assertEqual(infos, real_infos) 75 | 76 | # reset with full mask should have no effect 77 | mask = np.ones(self.num_envs) 78 | obss = self.vec_env.reset(mask) 79 | np.testing.assert_allclose(obss, real_obss) 80 | 81 | # reset with partial mask 82 | mask = np.zeros(self.num_envs) 83 | mask[-1] = 1 84 | obss = self.vec_env.reset(mask) 85 | real_obss = list(real_obss) 86 | for i in range(self.num_envs): 87 | if not mask[i]: 88 | real_obss[i] = self.envs[i].reset() 89 | np.testing.assert_allclose(obss, real_obss) 90 | 91 | 92 | testing.run_module(__name__, __file__) 93 | -------------------------------------------------------------------------------- /tests/test_agent.py: -------------------------------------------------------------------------------- 1 | from
__future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import os 10 | import tempfile 11 | import unittest 12 | 13 | import chainer 14 | import numpy as np 15 | 16 | import chainerrl 17 | 18 | 19 | def create_simple_link(): 20 | link = chainer.Link() 21 | with link.init_scope(): 22 | link.param = chainer.Parameter(np.zeros(1)) 23 | return link 24 | 25 | 26 | class Parent(chainerrl.agent.AttributeSavingMixin, object): 27 | 28 | saved_attributes = ['link', 'child'] 29 | 30 | def __init__(self): 31 | self.link = create_simple_link() 32 | self.child = Child() 33 | 34 | 35 | class Child(chainerrl.agent.AttributeSavingMixin, object): 36 | 37 | saved_attributes = ['link'] 38 | 39 | def __init__(self): 40 | self.link = create_simple_link() 41 | 42 | 43 | class Parent2(chainerrl.agent.AttributeSavingMixin, object): 44 | 45 | saved_attributes = ['child_a', 'child_b'] 46 | 47 | def __init__(self, child_a, child_b): 48 | self.child_a = child_a 49 | self.child_b = child_b 50 | 51 | 52 | class TestAttributeSavingMixin(unittest.TestCase): 53 | 54 | def test_save_load(self): 55 | parent = Parent() 56 | parent.link.param.array[:] = 1 57 | parent.child.link.param.array[:] = 2 58 | # Save 59 | dirname = tempfile.mkdtemp() 60 | parent.save(dirname) 61 | self.assertTrue(os.path.isdir(dirname)) 62 | self.assertTrue(os.path.isfile(os.path.join(dirname, 'link.npz'))) 63 | self.assertTrue(os.path.isdir(os.path.join(dirname, 'child'))) 64 | self.assertTrue(os.path.isfile( 65 | os.path.join(dirname, 'child', 'link.npz'))) 66 | # Load 67 | parent = Parent() 68 | self.assertEqual(int(parent.link.param.array), 0) 69 | self.assertEqual(int(parent.child.link.param.array), 0) 70 | parent.load(dirname) 71 | self.assertEqual(int(parent.link.param.array), 1) 72 | self.assertEqual(int(parent.child.link.param.array), 2) 73 | 74 | def test_save_load_2(self): 75 | parent = Parent() 76 | parent2 = Parent2(parent.child, parent) 77 | # Save 78 | dirname = tempfile.mkdtemp() 79 | parent2.save(dirname) 80 | # Load 81 | parent = Parent() 82 | parent2 = Parent2(parent.child, parent) 83 | parent2.load(dirname) 84 | 85 | def test_loop1(self): 86 | parent = Parent() 87 | parent.child = parent 88 | dirname = tempfile.mkdtemp() 89 | 90 | # The assertion in ChainerRL should fail on save(). 91 | # Otherwise it seems to raise OSError: [Errno 63] File name too long 92 | with self.assertRaises(AssertionError): 93 | parent.save(dirname) 94 | 95 | def test_loop2(self): 96 | parent1 = Parent() 97 | parent2 = Parent() 98 | parent1.child = parent2 99 | parent2.child = parent1 100 | dirname = tempfile.mkdtemp() 101 | 102 | # The assertion in ChainerRL should fail on save(). 
103 | # Otherwise it seems to raise OSError: [Errno 63] File name too long 104 | with self.assertRaises(AssertionError): 105 | parent1.save(dirname) 106 | -------------------------------------------------------------------------------- /tests/misc_tests/test_draw_computational_graph.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | from __future__ import print_function 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from future import standard_library 6 | standard_library.install_aliases() # NOQA 7 | 8 | import os 9 | import tempfile 10 | import unittest 11 | 12 | import chainer 13 | from chainer import testing 14 | import numpy as np 15 | 16 | import chainerrl 17 | 18 | 19 | _v = chainer.Variable(np.zeros(5)) 20 | _dav = chainerrl.action_value.DiscreteActionValue( 21 | chainer.Variable(np.zeros((5, 5)))) 22 | _qav = chainerrl.action_value.QuadraticActionValue( 23 | chainer.Variable(np.zeros((5, 5), dtype=np.float32)), 24 | chainer.Variable(np.ones((5, 5, 5), dtype=np.float32)), 25 | chainer.Variable(np.zeros((5, 1), dtype=np.float32)), 26 | ) 27 | _sdis = chainerrl.distribution.SoftmaxDistribution( 28 | chainer.Variable(np.zeros((5, 5)))) 29 | _gdis = chainerrl.distribution.GaussianDistribution( 30 | chainer.Variable(np.zeros((5, 5), dtype=np.float32)), 31 | chainer.Variable(np.ones((5, 5), dtype=np.float32))) 32 | 33 | 34 | @testing.parameterize( 35 | {'obj': [], 'expected': []}, 36 | {'obj': (), 'expected': []}, 37 | {'obj': _v, 'expected': [_v]}, 38 | {'obj': _dav, 'expected': list(_dav.params)}, 39 | {'obj': _qav, 'expected': list(_qav.params)}, 40 | {'obj': _sdis, 'expected': list(_sdis.params)}, 41 | {'obj': _gdis, 'expected': list(_gdis.params)}, 42 | {'obj': [_v, _dav, _sdis], 43 | 'expected': [_v] + list(_dav.params) + list(_sdis.params)}, 44 | ) 45 | class TestCollectVariables(unittest.TestCase): 46 | 47 | def _assert_eq_var_list(self, a, b): 48 | # Equality between two Variable lists 49 | self.assertEqual(len(a), len(b)) 50 | self.assertTrue(isinstance(a, list)) 51 | self.assertTrue(isinstance(b, list)) 52 | for item in a: 53 | self.assertTrue(isinstance(item, chainer.Variable)) 54 | for item in b: 55 | self.assertTrue(isinstance(item, chainer.Variable)) 56 | for va, vb in zip(a, b): 57 | self.assertEqual(id(va), id(vb)) 58 | 59 | def test_collect_variables(self): 60 | vs = chainerrl.misc.collect_variables(self.obj) 61 | self._assert_eq_var_list(vs, self.expected) 62 | 63 | # Wrap by a list 64 | vs = chainerrl.misc.collect_variables([self.obj]) 65 | self._assert_eq_var_list(vs, self.expected) 66 | 67 | # Wrap by two lists 68 | vs = chainerrl.misc.collect_variables([[self.obj]]) 69 | self._assert_eq_var_list(vs, self.expected) 70 | 71 | # Wrap by a tuple 72 | vs = chainerrl.misc.collect_variables((self.obj,)) 73 | self._assert_eq_var_list(vs, self.expected) 74 | 75 | # Wrap by a two tuples 76 | vs = chainerrl.misc.collect_variables(((self.obj,),)) 77 | self._assert_eq_var_list(vs, self.expected) 78 | 79 | 80 | class TestDrawComputationalGraph(unittest.TestCase): 81 | 82 | def test_draw_computational_graph(self): 83 | x = chainer.Variable(np.zeros(5)) 84 | y = x ** 2 + chainer.Variable(np.ones(5)) 85 | dirname = tempfile.mkdtemp() 86 | filepath = os.path.join(dirname, 'graph') 87 | chainerrl.misc.draw_computational_graph(y, filepath) 88 | self.assertTrue(os.path.exists(filepath + '.gv')) 89 | if chainerrl.misc.is_graphviz_available(): 90 | 
self.assertTrue(os.path.exists(filepath + '.png')) 91 | else: 92 | self.assertFalse(os.path.exists(filepath + '.png')) 93 | -------------------------------------------------------------------------------- /chainerrl/links/mlp_bn.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import chainer 10 | from chainer import functions as F 11 | from chainer import links as L 12 | 13 | from chainerrl.initializers import LeCunNormal 14 | 15 | 16 | class LinearBN(chainer.Chain): 17 | """Linear layer with BatchNormalization.""" 18 | 19 | def __init__(self, in_size, out_size): 20 | super().__init__() 21 | with self.init_scope(): 22 | self.linear = L.Linear(in_size, out_size) 23 | bn = L.BatchNormalization(out_size) 24 | bn.avg_var[:] = 1 25 | self.bn = bn 26 | 27 | def __call__(self, x): 28 | return self.bn(self.linear(x)) 29 | 30 | 31 | class MLPBN(chainer.Chain): 32 | """Multi-Layer Perceptron with Batch Normalization. 33 | 34 | Args: 35 | in_size (int): Input size. 36 | out_size (int): Output size. 37 | hidden_sizes (list of ints): Sizes of hidden channels. 38 | normalize_input (bool): If set to True, Batch Normalization is applied 39 | to inputs. 40 | normalize_output (bool): If set to True, Batch Normalization is applied 41 | to outputs. 42 | nonlinearity (callable): Nonlinearity between layers. It must accept a 43 | Variable as an argument and return a Variable with the same shape. 44 | Nonlinearities with learnable parameters such as PReLU are not 45 | supported. 46 | last_wscale (float): Scale of weight initialization of the last layer. 
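Example (illustrative; the sizes are arbitrary): mlp = MLPBN(in_size=10, out_size=3, hidden_sizes=[64, 64]); y = mlp(x), where x is a float32 batch of shape (batch_size, 10) with batch_size > 1 in train mode.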
47 | """ 48 | 49 | def __init__(self, in_size, out_size, hidden_sizes, normalize_input=True, 50 | normalize_output=False, nonlinearity=F.relu, last_wscale=1): 51 | self.in_size = in_size 52 | self.out_size = out_size 53 | self.hidden_sizes = hidden_sizes 54 | self.normalize_input = normalize_input 55 | self.normalize_output = normalize_output 56 | self.nonlinearity = nonlinearity 57 | 58 | super().__init__() 59 | with self.init_scope(): 60 | if normalize_input: 61 | self.input_bn = L.BatchNormalization(in_size) 62 | self.input_bn.avg_var[:] = 1 63 | 64 | if hidden_sizes: 65 | hidden_layers = [] 66 | hidden_layers.append(LinearBN(in_size, hidden_sizes[0])) 67 | for hin, hout in zip(hidden_sizes, hidden_sizes[1:]): 68 | hidden_layers.append(LinearBN(hin, hout)) 69 | self.hidden_layers = chainer.ChainList(*hidden_layers) 70 | self.output = L.Linear(hidden_sizes[-1], out_size, 71 | initialW=LeCunNormal(last_wscale)) 72 | else: 73 | self.output = L.Linear(in_size, out_size, 74 | initialW=LeCunNormal(last_wscale)) 75 | 76 | if normalize_output: 77 | self.output_bn = L.BatchNormalization(out_size) 78 | self.output_bn.avg_var[:] = 1 79 | 80 | def __call__(self, x): 81 | h = x 82 | assert (not chainer.config.train) or x.shape[0] > 1 83 | if self.normalize_input: 84 | h = self.input_bn(h) 85 | if self.hidden_sizes: 86 | for l in self.hidden_layers: 87 | h = self.nonlinearity(l(h)) 88 | h = self.output(h) 89 | if self.normalize_output: 90 | h = self.output_bn(h) 91 | return h 92 | -------------------------------------------------------------------------------- /tests/misc_tests/test_random.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | from __future__ import division 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | import timeit 10 | import unittest 11 | 12 | from chainer import testing 13 | from chainer.testing import condition 14 | import numpy as np 15 | from scipy import stats 16 | 17 | from chainerrl.misc.random import sample_n_k 18 | 19 | 20 | @testing.parameterize( 21 | {'n': 2, 'k': 2}, 22 | {'n': 5, 'k': 1}, 23 | {'n': 5, 'k': 4}, 24 | {'n': 7, 'k': 2}, 25 | {'n': 20, 'k': 10}, 26 | {'n': 100, 'k': 5}, 27 | {'n': 1, 'k': 0}, 28 | {'n': 0, 'k': 0}, 29 | ) 30 | class TestSampleNK(unittest.TestCase): 31 | def test_fast(self): 32 | self.samples = [sample_n_k(self.n, self.k) for _ in range(200)] 33 | self.subtest_constraints() 34 | 35 | def subtest_constraints(self): 36 | for s in self.samples: 37 | self.assertEqual(len(s), self.k) 38 | 39 | all(0 <= x < self.n for x in s) 40 | 41 | # distinct 42 | t = np.unique(s) 43 | self.assertEqual(len(t), self.k) 44 | 45 | @testing.attr.slow 46 | @condition.repeat_with_success_at_least(3, 2) 47 | def test_slow(self): 48 | self.samples = [sample_n_k(self.n, self.k) for _ in range(100000)] 49 | self.subtest_total_counts() 50 | self.subtest_order_counts() 51 | 52 | def subtest_total_counts(self): 53 | if self.k in [0, self.n]: 54 | return 55 | 56 | cnt = np.zeros(self.n) 57 | for s in self.samples: 58 | for x in s: 59 | cnt[x] += 1 60 | 61 | m = len(self.samples) 62 | 63 | p = self.k / self.n 64 | mean = m * p 65 | std = np.sqrt(m * p * (1 - p)) 66 | 67 | self.subtest_normal_distrib(cnt, mean, std) 68 | 69 | def subtest_order_counts(self): 70 | if self.k < 2: 71 | return 72 | 73 | ordered_pairs = [(i, j) for j in 
range(self.k) for i in range(j)] 74 | cnt = np.zeros(len(ordered_pairs)) 75 | 76 | for s in self.samples: 77 | for t, (i, j) in enumerate(ordered_pairs): 78 | if s[i] < s[j]: 79 | cnt[t] += 1 80 | 81 | m = len(self.samples) 82 | 83 | mean = m / 2 84 | std = np.sqrt(m / 4) 85 | 86 | self.subtest_normal_distrib(cnt, mean, std) 87 | 88 | def subtest_normal_distrib(self, xs, mean, std): 89 | _, pvalue = stats.kstest(xs, 'norm', (mean, std)) 90 | self.assertGreater(pvalue, 3e-3) 91 | 92 | 93 | class TestSampleNKSpeed(unittest.TestCase): 94 | def get_timeit(self, setup): 95 | return min(timeit.Timer( 96 | 'for n in range(64, 10000): sample_n_k(n, 64)', 97 | setup=setup).repeat(repeat=10, number=1)) 98 | 99 | @testing.attr.slow 100 | def _test(self): 101 | t = self.get_timeit( 102 | "from chainerrl.misc.random import sample_n_k") 103 | 104 | # faster than random.sample 105 | t1 = self.get_timeit(""" 106 | import random 107 | import six 108 | def sample_n_k(n, k): 109 | return random.sample(six.moves.range(n), k) 110 | """) 111 | self.assertLess(t, t1) 112 | 113 | # faster than np.random.choice(..., replace=False) 114 | t2 = self.get_timeit(""" 115 | import numpy as np 116 | def sample_n_k(n, k): 117 | return np.random.choice(n, k, replace=False) 118 | """) 119 | self.assertLess(t, t2) 120 | -------------------------------------------------------------------------------- /chainerrl/agents/dpp.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from builtins import * # NOQA 6 | from future import standard_library 7 | standard_library.install_aliases() # NOQA 8 | 9 | from abc import ABCMeta 10 | from abc import abstractmethod 11 | 12 | import chainer 13 | import chainer.functions as F 14 | from future.utils import with_metaclass 15 | 16 | from chainerrl.agents.dqn import DQN 17 | 18 | 19 | class AbstractDPP(with_metaclass(ABCMeta, DQN)): 20 | """Dynamic Policy Programming. 21 | 22 | See: https://arxiv.org/abs/1004.2027.
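The subclasses below differ only in the operator L implemented by _l_operator. As the inline comments in _compute_y_and_t spell out, the training target is t = Q'(s_t, a_t) + r_t + gamma * LQ'(s_{t+1}, .) - LQ'(s_t, .), where Q' denotes the target network.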
23 | """ 24 | 25 | @abstractmethod 26 | def _l_operator(self, qout): 27 | raise NotImplementedError() 28 | 29 | def _compute_target_values(self, exp_batch): 30 | 31 | batch_next_state = exp_batch['next_state'] 32 | 33 | target_next_qout = self.target_q_function(batch_next_state) 34 | next_q_expect = self._l_operator(target_next_qout) 35 | 36 | batch_rewards = exp_batch['reward'] 37 | batch_terminal = exp_batch['is_state_terminal'] 38 | 39 | return (batch_rewards + 40 | exp_batch['discount'] * (1 - batch_terminal) * next_q_expect) 41 | 42 | def _compute_y_and_t(self, exp_batch): 43 | 44 | batch_state = exp_batch['state'] 45 | batch_size = len(exp_batch['reward']) 46 | 47 | qout = self.q_function(batch_state) 48 | 49 | batch_actions = exp_batch['action'] 50 | # Q(s_t,a_t) 51 | batch_q = F.reshape(qout.evaluate_actions( 52 | batch_actions), (batch_size, 1)) 53 | 54 | with chainer.no_backprop_mode(): 55 | # Compute target values 56 | target_qout = self.target_q_function(batch_state) 57 | 58 | # Q'(s_t,a_t) 59 | target_q = F.reshape(target_qout.evaluate_actions( 60 | batch_actions), (batch_size, 1)) 61 | 62 | # LQ'(s_t,a) 63 | target_q_expect = F.reshape( 64 | self._l_operator(target_qout), (batch_size, 1)) 65 | 66 | # r + g * LQ'(s_{t+1},a) 67 | batch_q_target = F.reshape( 68 | self._compute_target_values(exp_batch), (batch_size, 1)) 69 | 70 | # Q'(s_t,a_t) + r + g * LQ'(s_{t+1},a) - LQ'(s_t,a) 71 | t = target_q + batch_q_target - target_q_expect 72 | 73 | return batch_q, t 74 | 75 | 76 | class DPP(AbstractDPP): 77 | """Dynamic Policy Programming with softmax operator. 78 | 79 | Args: 80 | eta (float): Positive constant. 81 | 82 | For other arguments, see DQN. 83 | """ 84 | 85 | def __init__(self, *args, **kwargs): 86 | self.eta = kwargs.pop('eta', 1.0) 87 | super().__init__(*args, **kwargs) 88 | 89 | def _l_operator(self, qout): 90 | return qout.compute_expectation(self.eta) 91 | 92 | 93 | class DPPL(AbstractDPP): 94 | """Dynamic Policy Programming with L operator. 95 | 96 | Args: 97 | eta (float): Positive constant. 98 | 99 | For other arguments, see DQN. 100 | """ 101 | 102 | def __init__(self, *args, **kwargs): 103 | self.eta = kwargs.pop('eta', 1.0) 104 | super().__init__(*args, **kwargs) 105 | 106 | def _l_operator(self, qout): 107 | return F.logsumexp(self.eta * qout.q_values, axis=1) / self.eta 108 | 109 | 110 | class DPPGreedy(AbstractDPP): 111 | """Dynamic Policy Programming with max operator. 112 | 113 | This algorithm corresponds to DPP with eta = infinity. 114 | """ 115 | 116 | def _l_operator(self, qout): 117 | return qout.max 118 | --------------------------------------------------------------------------------
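A minimal sketch of the three L-operators defined above, in plain NumPy/SciPy rather than Chainer. The softmax-expectation reading of `compute_expectation` is an assumption based on the surrounding code and the DPP paper; the log-sum-exp and max forms follow the code directly:

```python
# Illustrative only -- not part of the repository.
import numpy as np
from scipy.special import logsumexp, softmax

q = np.array([[1.0, 2.0, 3.0],
              [0.0, 0.0, 1.0]])  # Q-values, shape (batch, n_actions)
eta = 1.0

# DPP: softmax-weighted expectation of Q
# (assumed semantics of qout.compute_expectation(eta)).
l_dpp = np.sum(softmax(eta * q, axis=1) * q, axis=1)

# DPPL: log-sum-exp operator, as in F.logsumexp(eta * q, axis=1) / eta.
l_dppl = logsumexp(eta * q, axis=1) / eta

# DPPGreedy: max operator, the eta -> infinity limit of both.
l_greedy = q.max(axis=1)

print(l_dpp, l_dppl, l_greedy)
```

As eta grows, both soft operators approach the max, which is why DPPGreedy is described as DPP with eta = infinity.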