├── rl ├── __init__.py ├── agents │ ├── __init__.py │ ├── cem.py │ ├── sarsa.py │ └── ddpg.py ├── keras_future.py ├── random.py ├── processors.py ├── policy.py ├── util.py ├── memory.py ├── callbacks.py └── core.py ├── tests ├── __init__.py ├── rl │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── test_cem.py │ │ ├── test_ddpg.py │ │ └── test_dqn.py │ ├── util.py │ ├── test_util.py │ ├── test_core.py │ └── test_memory.py └── integration │ ├── test_continuous.py │ └── test_discrete.py ├── docs ├── sources │ ├── index.md │ ├── agents │ │ ├── naf.md │ │ ├── sarsa.md │ │ ├── ddpg.md │ │ ├── cem.md │ │ ├── dqn.md │ │ └── overview.md │ ├── processors.md │ └── core.md ├── templates │ ├── index.md │ ├── core.md │ ├── processors.md │ └── agents │ │ ├── naf.md │ │ ├── ddpg.md │ │ ├── sarsa.md │ │ ├── cem.md │ │ ├── dqn.md │ │ └── overview.md ├── requirements.txt └── autogen.py ├── setup.cfg ├── assets ├── breakout.gif ├── cartpole.gif └── pendulum.gif ├── setup.py ├── mkdocs.yml ├── ISSUE_TEMPLATE.md ├── LICENSE ├── .gitignore ├── pytest.ini ├── examples ├── sarsa_cartpole.py ├── visualize_log.py ├── dqn_cartpole.py ├── cem_cartpole.py ├── duel_dqn_cartpole.py ├── ddpg_pendulum.py ├── ddpg_mujoco.py ├── naf_pendulum.py └── dqn_atari.py ├── .travis.yml └── README.md /rl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/sources/index.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/templates/index.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/rl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/rl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/templates/core.md: -------------------------------------------------------------------------------- 1 | {{autogenerated}} 2 | -------------------------------------------------------------------------------- /docs/templates/processors.md: -------------------------------------------------------------------------------- 1 | {{autogenerated}} 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | mkdocs 3 | python-markdown-math 4 | -------------------------------------------------------------------------------- /assets/breakout.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cafe/keras-rl/master/assets/breakout.gif 
-------------------------------------------------------------------------------- /assets/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cafe/keras-rl/master/assets/cartpole.gif -------------------------------------------------------------------------------- /assets/pendulum.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cafe/keras-rl/master/assets/pendulum.gif -------------------------------------------------------------------------------- /rl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .dqn import DQNAgent, NAFAgent, ContinuousDQNAgent 3 | from .ddpg import DDPGAgent 4 | from .cem import CEMAgent 5 | from .sarsa import SarsaAgent, SARSAAgent 6 | -------------------------------------------------------------------------------- /docs/templates/agents/naf.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Continuous Deep Q-Learning with Model-based Acceleration](https://arxiv.org/abs/1603.00748), Gu et al., 2016 11 | -------------------------------------------------------------------------------- /docs/templates/agents/ddpg.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971), Lillicrap et al., 2015 11 | -------------------------------------------------------------------------------- /docs/templates/agents/sarsa.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Reinforcement learning: An introduction](http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf), Sutton and Barto, 2011 11 | -------------------------------------------------------------------------------- /docs/templates/agents/cem.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Learning Tetris Using the Noisy Cross-Entropy Method](http://www.mitpressjournals.org/doi/abs/10.1162/neco.2006.18.12.2936?journalCode=neco), Szita et al., 2006 11 | - [Deep Reinforcement Learning (MLSS lecture notes)](http://learning.mpi-sws.org/mlss2016/slides/2016-MLSS-RL.pdf), Schulman, 2016 12 | -------------------------------------------------------------------------------- /docs/sources/agents/naf.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/dqn.py#L548) 6 | ### NAFAgent 7 | 8 | ```python 9 | rl.agents.dqn.NAFAgent(V_model, L_model, mu_model, random_process=None, covariance_mode='full') 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Continuous Deep Q-Learning with Model-based Acceleration](https://arxiv.org/abs/1603.00748), Gu et al., 2016 19 | -------------------------------------------------------------------------------- /docs/sources/agents/sarsa.md: 
-------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/sarsa.py#L17) 6 | ### SARSAAgent 7 | 8 | ```python 9 | rl.agents.sarsa.SARSAAgent(model, nb_actions, policy=None, test_policy=None, gamma=0.99, nb_steps_warmup=10, train_interval=1, delta_clip=inf) 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Reinforcement learning: An introduction](http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf), Sutton and Barto, 2011 19 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools import find_packages 3 | 4 | 5 | setup(name='keras-rl', 6 | version='0.3.1', 7 | description='Deep Reinforcement Learning for Keras', 8 | author='Matthias Plappert', 9 | author_email='matthiasplappert@me.com', 10 | url='https://github.com/matthiasplappert/keras-rl', 11 | download_url='https://github.com/matthiasplappert/keras-rl/archive/v0.3.1.tar.gz', 12 | license='MIT', 13 | install_requires=['keras>=1.0.7,<2.0.7'], 14 | extras_require={ 15 | 'gym': ['gym'], 16 | }, 17 | packages=find_packages()) 18 | -------------------------------------------------------------------------------- /docs/templates/agents/dqn.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/abs/1312.5602), Mnih et al., 2013 11 | - [Human-level control through deep reinforcement learning](http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html), Mnih et al., 2015 12 | - [Deep Reinforcement Learning with Double Q-learning](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Applications_files/doubledqn.pdf), van Hasselt et al., 2015 13 | - [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581), Wang et al., 2016 14 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Keras-RL Documentation 2 | theme: readthedocs 3 | docs_dir: docs/sources 4 | repo_url: https://github.com/matthiasplappert/keras-rl 5 | site_description: 'Documentation for Keras-RL, a library for Deep Reinforcement Learning with Keras.' 
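# Optional: uncomment the two lines below to enable math rendering; the mdx_math
# extension is provided by the python-markdown-math package listed in docs/requirements.txt.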
6 | #markdown_extensions: [mdx_math] 7 | #extra_javascript: ['https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML'] 8 | 9 | dev_addr: '0.0.0.0:8000' 10 | 11 | pages: 12 | - Home: index.md 13 | - Core: core.md 14 | - Agents: 15 | - Overview: agents/overview.md 16 | - DQNAgent: agents/dqn.md 17 | - NAFAgent: agents/naf.md 18 | - DDPGAgent: agents/ddpg.md 19 | - SARSAAgent: agents/sarsa.md 20 | - CEMAgent: agents/cem.md 21 | -------------------------------------------------------------------------------- /docs/sources/agents/ddpg.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/ddpg.py#L22) 6 | ### DDPGAgent 7 | 8 | ```python 9 | rl.agents.ddpg.DDPGAgent(nb_actions, actor, critic, critic_action_input, memory, gamma=0.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, train_interval=1, memory_interval=1, delta_range=None, delta_clip=inf, random_process=None, custom_model_objects={}, target_model_update=0.001) 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971), Lillicrap et al., 2015 19 | -------------------------------------------------------------------------------- /tests/rl/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | from rl.core import Env 5 | 6 | 7 | class MultiInputTestEnv(Env): 8 | def __init__(self, observation_shape): 9 | self.observation_shape = observation_shape 10 | 11 | def step(self, action): 12 | return self._get_obs(), random.choice([0, 1]), random.choice([True, False]), {} 13 | 14 | def reset(self): 15 | return self._get_obs() 16 | 17 | def _get_obs(self): 18 | if type(self.observation_shape) is list: 19 | return [np.random.random(s) for s in self.observation_shape] 20 | else: 21 | return np.random.random(self.observation_shape) 22 | 23 | def __del__(self): 24 | pass 25 | -------------------------------------------------------------------------------- /rl/keras_future.py: -------------------------------------------------------------------------------- 1 | import keras 2 | import keras.layers 3 | import keras.models 4 | 5 | 6 | def concatenate(x): 7 | if hasattr(keras.layers, 'Concatenate'): 8 | return keras.layers.Concatenate()(x) 9 | else: 10 | return keras.layers.merge(x, mode='concat') 11 | 12 | 13 | def add(x): 14 | if hasattr(keras.layers, 'Add'): 15 | return keras.layers.Add()(x) 16 | else: 17 | return keras.layers.merge(x, mode='sum') 18 | 19 | 20 | def Model(input, output, **kwargs): 21 | if int(keras.__version__.split('.')[0]) >= 2: 22 | return keras.models.Model(inputs=input, outputs=output, **kwargs) 23 | else: 24 | return keras.models.Model(input=input, output=output, **kwargs) 25 | -------------------------------------------------------------------------------- /docs/sources/agents/cem.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/cem.py#L12) 6 | ### CEMAgent 7 | 8 | ```python 9 | rl.agents.cem.CEMAgent(model, nb_actions, memory, batch_size=50, nb_steps_warmup=1000, train_interval=50, elite_frac=0.05, memory_interval=1, theta_init=None, noise_decay_const=0.0, noise_ampl=0.0) 10 | ``` 11 | 12 | Write me 13 | 
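A minimal usage sketch, adapted from `examples/cem_cartpole.py` in this repository (the environment and hyperparameters are illustrative, not required values):

```python
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

env = gym.make('CartPole-v0')
nb_actions = env.action_space.n

# A simple linear policy with a softmax over the available actions.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

# CEM keeps a memory of episode parameters and returns rather than individual transitions.
memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
cem.test(env, nb_episodes=5, visualize=True)
```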
14 | 15 | --- 16 | 17 | ### References 18 | - [Learning Tetris Using the Noisy Cross-Entropy Method](http://www.mitpressjournals.org/doi/abs/10.1162/neco.2006.18.12.2936?journalCode=neco), Szita et al., 2006 19 | - [Deep Reinforcement Learning (MLSS lecture notes)](http://learning.mpi-sws.org/mlss2016/slides/2016-MLSS-RL.pdf), Schulman, 2016 20 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Please make sure that the boxes below are checked before you submit your issue. If your issue is an implementation question, please ask your question in the [Keras-RL Google group](https://groups.google.com/forum/#!forum/keras-rl-users) or [join the Keras-RL Gitter channel](https://gitter.im/keras-rl/Lobby) and ask there instead of filing a GitHub issue. 2 | 3 | Thank you! 4 | 5 | - [ ] Check that you are up-to-date with the master branch of Keras-RL. You can update with: 6 | `pip install git+git://github.com/matthiasplappert/keras-rl.git --upgrade --no-deps` 7 | 8 | - [ ] Check that you are up-to-date with the master branch of Keras. You can update with: 9 | `pip install git+git://github.com/fchollet/keras.git --upgrade --no-deps` 10 | 11 | - [ ] Provide a link to a GitHub Gist of a Python script that can reproduce your issue (or just copy the script here if it is short). If you report an error, please include the error message and the backtrace. 12 | -------------------------------------------------------------------------------- /docs/sources/agents/dqn.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/dqn.py#L89) 6 | ### DQNAgent 7 | 8 | ```python 9 | rl.agents.dqn.DQNAgent(model, policy=None, test_policy=None, enable_double_dqn=True, enable_dueling_network=False, dueling_type='avg') 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/abs/1312.5602), Mnih et al., 2013 19 | - [Human-level control through deep reinforcement learning](http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html), Mnih et al., 2015 20 | - [Deep Reinforcement Learning with Double Q-learning](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Applications_files/doubledqn.pdf), van Hasselt et al., 2015 21 | - [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581), Wang et al., 2016 22 | -------------------------------------------------------------------------------- /docs/templates/agents/overview.md: -------------------------------------------------------------------------------- 1 | ## Available Agents 2 | 3 | | Name | Implementation | Observation Space | Action Space | 4 | | ---------------------- |------------------------| -------------------| ---------------| 5 | | [DQN](/agents/dqn) | `rl.agents.DQNAgent` | discrete or continuous | discrete | 6 | | [DDPG](/agents/ddpg) | `rl.agents.DDPGAgent` | discrete or continuous | continuous | 7 | | [NAF](/agents/naf) | `rl.agents.NAFAgent` | discrete or continuous | continuous | 8 | | [CEM](/agents/cem) | `rl.agents.CEMAgent` | discrete or continuous | discrete | 9 | | [SARSA](/agents/sarsa) | `rl.agents.SARSAAgent` | discrete or continuous | discrete | 10 | 11 | --- 12 | 13 | ## Common API 14 | 15 | All agents share a common API. 
This allows you to easily switch between different agents. 16 | That being said, keep in mind that some agents make assumptions regarding the action space, i.e. assume discrete 17 | or continuous actions. 18 | 19 | {{autogenerated}} 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Matthias Plappert 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS X 2 | .DS_Store 3 | docs/site/* 4 | 5 | # Ubuntu 6 | *~ 7 | 8 | # PyCharm 9 | .idea 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | env/ 21 | build/ 22 | download/ 23 | bin/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | include/ 33 | lib/ 34 | man/ 35 | local/ 36 | var/ 37 | share/ 38 | pip-selfcheck.json 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *,cover 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # Configuration of py.test 2 | [pytest] 3 | addopts=-v 4 | -n 2 5 | --durations=10 6 | --cov-report term-missing 7 | --cov=rl 8 | 9 | # Do not run tests in the build folder or in the virtualenv folder `venv`. 
10 | norecursedirs=build venv 11 | 12 | # PEP-8 The following are ignored: 13 | # E251 unexpected spaces around keyword / parameter equals 14 | # E225 missing whitespace around operator 15 | # E226 missing whitespace around arithmetic operator 16 | # W291 trailing whitespace 17 | # W293 blank line contains whitespace 18 | # E501 line too long (82 > 79 characters) 19 | # E402 module level import not at top of file - temporary measure to coninue adding ros python packaged in sys.path 20 | # E731 do not assign a lambda expression, use a def 21 | # E302 two blank lines between the functions 22 | # E231 missing whitespace after , 23 | # E241 multiple spaces after ',' 24 | # E261 at least two spaces before inline comment 25 | 26 | 27 | pep8ignore=* E251 \ 28 | * E225 \ 29 | * E226 \ 30 | * W291 \ 31 | * W293 \ 32 | * E501 \ 33 | * E402 \ 34 | * E731 \ 35 | * E302 \ 36 | * E231 \ 37 | * E241 \ 38 | * E261 39 | -------------------------------------------------------------------------------- /tests/rl/agents/test_cem.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import absolute_import 3 | 4 | import pytest 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | 8 | from keras.models import Model, Sequential 9 | from keras.layers import Input, merge, Dense, Flatten 10 | 11 | from rl.agents.cem import CEMAgent 12 | from rl.memory import EpisodeParameterMemory 13 | from rl.processors import MultiInputProcessor 14 | 15 | from ..util import MultiInputTestEnv 16 | 17 | 18 | def test_single_cem_input(): 19 | model = Sequential() 20 | model.add(Flatten(input_shape=(2, 3))) 21 | model.add(Dense(2)) 22 | 23 | memory = EpisodeParameterMemory(limit=10, window_length=2) 24 | agent = CEMAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, train_interval=50) 25 | agent.compile() 26 | agent.fit(MultiInputTestEnv((3,)), nb_steps=100) 27 | 28 | 29 | def test_multi_cem_input(): 30 | input1 = Input(shape=(2, 3)) 31 | input2 = Input(shape=(2, 4)) 32 | x = merge([input1, input2], mode='concat') 33 | x = Flatten()(x) 34 | x = Dense(2)(x) 35 | model = Model(input=[input1, input2], output=x) 36 | 37 | memory = EpisodeParameterMemory(limit=10, window_length=2) 38 | processor = MultiInputProcessor(nb_inputs=2) 39 | agent = CEMAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, 40 | processor=processor, train_interval=50) 41 | agent.compile() 42 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=100) 43 | -------------------------------------------------------------------------------- /examples/sarsa_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents import SARSAAgent 9 | from rl.policy import BoltzmannQPolicy 10 | 11 | 12 | ENV_NAME = 'CartPole-v0' 13 | 14 | # Get the environment and extract the number of actions. 15 | env = gym.make(ENV_NAME) 16 | np.random.seed(123) 17 | env.seed(123) 18 | nb_actions = env.action_space.n 19 | 20 | # Next, we build a very simple model. 
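# Note on the input shape used throughout these examples: observations are passed to the
# model with a leading window dimension, i.e. (window_length,) + observation_shape. Here
# the window holds a single observation, so the shape is (1,) + shape and Flatten collapses it.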
21 | model = Sequential() 22 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 23 | model.add(Dense(16)) 24 | model.add(Activation('relu')) 25 | model.add(Dense(16)) 26 | model.add(Activation('relu')) 27 | model.add(Dense(16)) 28 | model.add(Activation('relu')) 29 | model.add(Dense(nb_actions)) 30 | model.add(Activation('linear')) 31 | print(model.summary()) 32 | 33 | # SARSA does not require a memory. 34 | policy = BoltzmannQPolicy() 35 | sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy) 36 | sarsa.compile(Adam(lr=1e-3), metrics=['mae']) 37 | 38 | # Okay, now it's time to learn something! We visualize the training here for show, but this 39 | # slows down training quite a lot. You can always safely abort the training prematurely using 40 | # Ctrl + C. 41 | sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2) 42 | 43 | # After training is done, we save the final weights. 44 | sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 45 | 46 | # Finally, evaluate our algorithm for 5 episodes. 47 | sarsa.test(env, nb_episodes=5, visualize=True) 48 | -------------------------------------------------------------------------------- /examples/visualize_log.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def visualize_log(filename, figsize=None, output=None): 8 | with open(filename, 'r') as f: 9 | data = json.load(f) 10 | if 'episode' not in data: 11 | raise ValueError('Log file "{}" does not contain the "episode" key.'.format(filename)) 12 | episodes = data['episode'] 13 | 14 | # Get value keys. The x axis is shared and is the number of episodes. 15 | keys = sorted(list(set(data.keys()).difference(set(['episode'])))) 16 | 17 | if figsize is None: 18 | figsize = (15., 5. * len(keys)) 19 | f, axarr = plt.subplots(len(keys), sharex=True, figsize=figsize) 20 | for idx, key in enumerate(keys): 21 | axarr[idx].plot(episodes, data[key]) 22 | axarr[idx].set_ylabel(key) 23 | plt.xlabel('episodes') 24 | plt.tight_layout() 25 | if output is None: 26 | plt.show() 27 | else: 28 | plt.savefig(output) 29 | 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('filename', type=str, help='The filename of the JSON log generated during training.') 33 | parser.add_argument('--output', type=str, default=None, help='The output file. If not specified, the log will only be displayed.') 34 | parser.add_argument('--figsize', nargs=2, type=float, default=None, help='The size of the figure in `width height` format specified in points.') 35 | args = parser.parse_args() 36 | 37 | # You can use visualize_log to easily view the stats that were recorded during training. Simply 38 | # provide the filename of the `FileLogger` that was used in `FileLogger`. 
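# For reference, such a JSON log is produced during training by attaching the FileLogger
# callback from rl.callbacks, e.g. (hypothetical filename):
#   dqn.fit(env, nb_steps=50000, callbacks=[FileLogger('dqn_log.json', interval=100)])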
39 | visualize_log(args.filename, output=args.output, figsize=args.figsize) 40 | -------------------------------------------------------------------------------- /examples/dqn_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents.dqn import DQNAgent 9 | from rl.policy import BoltzmannQPolicy 10 | from rl.memory import SequentialMemory 11 | 12 | 13 | ENV_NAME = 'CartPole-v0' 14 | 15 | 16 | # Get the environment and extract the number of actions. 17 | env = gym.make(ENV_NAME) 18 | np.random.seed(123) 19 | env.seed(123) 20 | nb_actions = env.action_space.n 21 | 22 | # Next, we build a very simple model. 23 | model = Sequential() 24 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 25 | model.add(Dense(16)) 26 | model.add(Activation('relu')) 27 | model.add(Dense(16)) 28 | model.add(Activation('relu')) 29 | model.add(Dense(16)) 30 | model.add(Activation('relu')) 31 | model.add(Dense(nb_actions)) 32 | model.add(Activation('linear')) 33 | print(model.summary()) 34 | 35 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 36 | # even the metrics! 37 | memory = SequentialMemory(limit=50000, window_length=1) 38 | policy = BoltzmannQPolicy() 39 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 40 | target_model_update=1e-2, policy=policy) 41 | dqn.compile(Adam(lr=1e-3), metrics=['mae']) 42 | 43 | # Okay, now it's time to learn something! We visualize the training here for show, but this 44 | # slows down training quite a lot. You can always safely abort the training prematurely using 45 | # Ctrl + C. 46 | dqn.fit(env, nb_steps=50000, visualize=True, verbose=2) 47 | 48 | # After training is done, we save the final weights. 49 | dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 50 | 51 | # Finally, evaluate our algorithm for 5 episodes. 52 | dqn.test(env, nb_episodes=5, visualize=True) 53 | -------------------------------------------------------------------------------- /docs/sources/processors.md: -------------------------------------------------------------------------------- 1 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/processors.py#L7) 2 | ### MultiInputProcessor 3 | 4 | ```python 5 | rl.processors.MultiInputProcessor(nb_inputs) 6 | ``` 7 | 8 | Converts observations from an environment with multiple observations for use in a neural network 9 | policy. 10 | 11 | In some cases, you have environments that return multiple different observations per timestep 12 | (in a robotics context, for example, a camera may be used to view the scene and a joint encoder may 13 | be used to report the angles for each joint). Usually, this can be handled by a policy that has 14 | multiple inputs, one for each modality. However, observations are returned by the environment 15 | in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network 16 | expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [[modalityn_1, ..., modalityn_T]]`. 17 | This processor converts observations appropriate for this use case. 18 | 19 | __Arguments__ 20 | 21 | - __nb_inputs__ (integer): The number of inputs, that is different modalities, to be used. 
22 | Your neural network that you use for the policy must have a corresponding number of 23 | inputs. 24 | 25 | ---- 26 | 27 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/processors.py#L40) 28 | ### WhiteningNormalizerProcessor 29 | 30 | ```python 31 | rl.processors.WhiteningNormalizerProcessor() 32 | ``` 33 | 34 | Normalizes the observations to have zero mean and standard deviation of one, 35 | i.e. it applies whitening to the inputs. 36 | 37 | This typically helps significantly with learning, especially if different dimensions are 38 | on different scales. However, it complicates training in the sense that you will have to store 39 | these weights alongside the policy if you intend to load it later. It is the responsibility of 40 | the user to do so. 41 | 42 | -------------------------------------------------------------------------------- /examples/cem_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents.cem import CEMAgent 9 | from rl.memory import EpisodeParameterMemory 10 | 11 | ENV_NAME = 'CartPole-v0' 12 | 13 | 14 | # Get the environment and extract the number of actions. 15 | env = gym.make(ENV_NAME) 16 | np.random.seed(123) 17 | env.seed(123) 18 | 19 | nb_actions = env.action_space.n 20 | obs_dim = env.observation_space.shape[0] 21 | 22 | # Option 1 : Simple model 23 | model = Sequential() 24 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 25 | model.add(Dense(nb_actions)) 26 | model.add(Activation('softmax')) 27 | 28 | # Option 2: deep network 29 | # model = Sequential() 30 | # model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 31 | # model.add(Dense(16)) 32 | # model.add(Activation('relu')) 33 | # model.add(Dense(16)) 34 | # model.add(Activation('relu')) 35 | # model.add(Dense(16)) 36 | # model.add(Activation('relu')) 37 | # model.add(Dense(nb_actions)) 38 | # model.add(Activation('softmax')) 39 | 40 | 41 | print(model.summary()) 42 | 43 | 44 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 45 | # even the metrics! 46 | memory = EpisodeParameterMemory(limit=1000, window_length=1) 47 | 48 | cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, 49 | batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05) 50 | cem.compile() 51 | 52 | # Okay, now it's time to learn something! We visualize the training here for show, but this 53 | # slows down training quite a lot. You can always safely abort the training prematurely using 54 | # Ctrl + C. 55 | cem.fit(env, nb_steps=100000, visualize=False, verbose=2) 56 | 57 | # After training is done, we save the best weights. 58 | cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True) 59 | 60 | # Finally, evaluate our algorithm for 5 episodes. 
61 | cem.test(env, nb_episodes=5, visualize=True) 62 | -------------------------------------------------------------------------------- /examples/duel_dqn_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents.dqn import DQNAgent 9 | from rl.policy import BoltzmannQPolicy 10 | from rl.memory import SequentialMemory 11 | 12 | 13 | ENV_NAME = 'CartPole-v0' 14 | 15 | 16 | # Get the environment and extract the number of actions. 17 | env = gym.make(ENV_NAME) 18 | np.random.seed(123) 19 | env.seed(123) 20 | nb_actions = env.action_space.n 21 | 22 | # Next, we build a very simple model regardless of the dueling architecture 23 | # if you enable dueling network in DQN , DQN will build a dueling network base on your model automatically 24 | # Also, you can build a dueling network by yourself and turn off the dueling network in DQN. 25 | model = Sequential() 26 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 27 | model.add(Dense(16)) 28 | model.add(Activation('relu')) 29 | model.add(Dense(16)) 30 | model.add(Activation('relu')) 31 | model.add(Dense(16)) 32 | model.add(Activation('relu')) 33 | model.add(Dense(nb_actions, activation='linear')) 34 | print(model.summary()) 35 | 36 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 37 | # even the metrics! 38 | memory = SequentialMemory(limit=50000, window_length=1) 39 | policy = BoltzmannQPolicy() 40 | # enable the dueling network 41 | # you can specify the dueling_type to one of {'avg','max','naive'} 42 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 43 | enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=policy) 44 | dqn.compile(Adam(lr=1e-3), metrics=['mae']) 45 | 46 | # Okay, now it's time to learn something! We visualize the training here for show, but this 47 | # slows down training quite a lot. You can always safely abort the training prematurely using 48 | # Ctrl + C. 49 | dqn.fit(env, nb_steps=50000, visualize=False, verbose=2) 50 | 51 | # After training is done, we save the final weights. 52 | dqn.save_weights('duel_dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 53 | 54 | # Finally, evaluate our algorithm for 5 episodes. 55 | dqn.test(env, nb_episodes=5, visualize=False) 56 | -------------------------------------------------------------------------------- /rl/random.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | 4 | 5 | class RandomProcess(object): 6 | def reset_states(self): 7 | pass 8 | 9 | 10 | class AnnealedGaussianProcess(RandomProcess): 11 | def __init__(self, mu, sigma, sigma_min, n_steps_annealing): 12 | self.mu = mu 13 | self.sigma = sigma 14 | self.n_steps = 0 15 | 16 | if sigma_min is not None: 17 | self.m = -float(sigma - sigma_min) / float(n_steps_annealing) 18 | self.c = sigma 19 | self.sigma_min = sigma_min 20 | else: 21 | self.m = 0. 
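            # (With sigma_min unset, annealing is disabled: the zero slope above keeps
            # current_sigma fixed at sigma.)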
22 | self.c = sigma 23 | self.sigma_min = sigma 24 | 25 | @property 26 | def current_sigma(self): 27 | sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c) 28 | return sigma 29 | 30 | 31 | class GaussianWhiteNoiseProcess(AnnealedGaussianProcess): 32 | def __init__(self, mu=0., sigma=1., sigma_min=None, n_steps_annealing=1000, size=1): 33 | super(GaussianWhiteNoiseProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing) 34 | self.size = size 35 | 36 | def sample(self): 37 | sample = np.random.normal(self.mu, self.current_sigma, self.size) 38 | self.n_steps += 1 39 | return sample 40 | 41 | # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 42 | class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess): 43 | def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000): 44 | super(OrnsteinUhlenbeckProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing) 45 | self.theta = theta 46 | self.mu = mu 47 | self.dt = dt 48 | self.x0 = x0 49 | self.size = size 50 | self.reset_states() 51 | 52 | def sample(self): 53 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size) 54 | self.x_prev = x 55 | self.n_steps += 1 56 | return x 57 | 58 | def reset_states(self): 59 | self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size) 60 | -------------------------------------------------------------------------------- /docs/sources/core.md: -------------------------------------------------------------------------------- 1 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L11) 2 | ### Agent 3 | 4 | ```python 5 | rl.core.Agent(processor=None) 6 | ``` 7 | 8 | Abstract base class for all implemented agents. 9 | 10 | Each agent interacts with the environment (as defined by the `Env` class) by first observing the 11 | state of the environment. Based on this observation the agent changes the environment by performing 12 | an action. 13 | 14 | Do not use this abstract base class directly but instead use one of the concrete agents implemented. 15 | Each agent realizes a reinforcement learning algorithm. Since all agents conform to the same 16 | interface, you can use them interchangeably. 17 | 18 | To implement your own agent, you have to implement the following methods: 19 | 20 | - `forward` 21 | - `backward` 22 | - `compile` 23 | - `load_weights` 24 | - `save_weights` 25 | - `layers` 26 | 27 | __Arguments__ 28 | 29 | - __processor__ (`Processor` instance): See [Processor](#processor) for details. 30 | 31 | ---- 32 | 33 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L454) 34 | ### Processor 35 | 36 | ```python 37 | rl.core.Processor() 38 | ``` 39 | 40 | Abstract base class for implementing processors. 41 | 42 | A processor acts as a coupling mechanism between an `Agent` and its `Env`. This can 43 | be necessary if your agent has different requirements with respect to the form of the 44 | observations, actions, and rewards of the environment. By implementing a custom processor, 45 | you can effectively translate between the two without having to change the underlaying 46 | implementation of the agent or environment. 47 | 48 | Do not use this abstract base class directly but instead use one of the concrete implementations 49 | or write your own. 
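A minimal sketch of a custom processor, modelled on the `PendulumProcessor` and `MujocoProcessor` classes used in the bundled examples (the scaling factor and clipping range below are purely illustrative):

```python
import numpy as np

from rl.core import Processor


class ScaleAndClipProcessor(Processor):
    """Rescales rewards and clips actions before they are exchanged with the environment."""

    def process_reward(self, reward):
        # Reduce the reward magnitude by two orders of magnitude.
        return reward / 100.

    def process_action(self, action):
        # Keep actions within the range the environment accepts.
        return np.clip(action, -1., 1.)
```

An instance is passed to an agent via its `processor` argument, for example `DDPGAgent(..., processor=ScaleAndClipProcessor())`.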
50 | 51 | ---- 52 | 53 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L533) 54 | ### Env 55 | 56 | ```python 57 | rl.core.Env() 58 | ``` 59 | 60 | The abstract environment class that is used by all agents. This class has the exact 61 | same API that OpenAI Gym uses so that integrating with it is trivial. In contrast to the 62 | OpenAI Gym implementation, this class only defines the abstract methods without any actual 63 | implementation. 64 | 65 | ---- 66 | 67 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L609) 68 | ### Space 69 | 70 | ```python 71 | rl.core.Space() 72 | ``` 73 | 74 | Abstract model for a space that is used for the state and action spaces. This class has the 75 | exact same API that OpenAI Gym uses so that integrating with it is trivial. 76 | 77 | -------------------------------------------------------------------------------- /rl/processors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rl.core import Processor 4 | from rl.util import WhiteningNormalizer 5 | 6 | 7 | class MultiInputProcessor(Processor): 8 | """Converts observations from an environment with multiple observations for use in a neural network 9 | policy. 10 | 11 | In some cases, you have environments that return multiple different observations per timestep 12 | (in a robotics context, for example, a camera may be used to view the scene and a joint encoder may 13 | be used to report the angles for each joint). Usually, this can be handled by a policy that has 14 | multiple inputs, one for each modality. However, observations are returned by the environment 15 | in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network 16 | expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [[modalityn_1, ..., modalityn_T]]`. 17 | This processor converts observations appropriate for this use case. 18 | 19 | # Arguments 20 | nb_inputs (integer): The number of inputs, that is different modalities, to be used. 21 | Your neural network that you use for the policy must have a corresponding number of 22 | inputs. 23 | """ 24 | def __init__(self, nb_inputs): 25 | self.nb_inputs = nb_inputs 26 | 27 | def process_state_batch(self, state_batch): 28 | input_batches = [[] for x in range(self.nb_inputs)] 29 | for state in state_batch: 30 | processed_state = [[] for x in range(self.nb_inputs)] 31 | for observation in state: 32 | assert len(observation) == self.nb_inputs 33 | for o, s in zip(observation, processed_state): 34 | s.append(o) 35 | for idx, s in enumerate(processed_state): 36 | input_batches[idx].append(s) 37 | return [np.array(x) for x in input_batches] 38 | 39 | 40 | class WhiteningNormalizerProcessor(Processor): 41 | """Normalizes the observations to have zero mean and standard deviation of one, 42 | i.e. it applies whitening to the inputs. 43 | 44 | This typically helps significantly with learning, especially if different dimensions are 45 | on different scales. However, it complicates training in the sense that you will have to store 46 | these weights alongside the policy if you intend to load it later. It is the responsibility of 47 | the user to do so. 
48 | """ 49 | def __init__(self): 50 | self.normalizer = None 51 | 52 | def process_state_batch(self, batch): 53 | if self.normalizer is None: 54 | self.normalizer = WhiteningNormalizer(shape=batch.shape[1:], dtype=batch.dtype) 55 | self.normalizer.update(batch) 56 | return self.normalizer.normalize(batch) 57 | -------------------------------------------------------------------------------- /examples/ddpg_pendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential, Model 5 | from keras.layers import Dense, Activation, Flatten, Input, merge 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents import DDPGAgent 9 | from rl.memory import SequentialMemory 10 | from rl.random import OrnsteinUhlenbeckProcess 11 | 12 | 13 | ENV_NAME = 'Pendulum-v0' 14 | gym.undo_logger_setup() 15 | 16 | 17 | # Get the environment and extract the number of actions. 18 | env = gym.make(ENV_NAME) 19 | np.random.seed(123) 20 | env.seed(123) 21 | assert len(env.action_space.shape) == 1 22 | nb_actions = env.action_space.shape[0] 23 | 24 | # Next, we build a very simple model. 25 | actor = Sequential() 26 | actor.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 27 | actor.add(Dense(16)) 28 | actor.add(Activation('relu')) 29 | actor.add(Dense(16)) 30 | actor.add(Activation('relu')) 31 | actor.add(Dense(16)) 32 | actor.add(Activation('relu')) 33 | actor.add(Dense(nb_actions)) 34 | actor.add(Activation('linear')) 35 | print(actor.summary()) 36 | 37 | action_input = Input(shape=(nb_actions,), name='action_input') 38 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 39 | flattened_observation = Flatten()(observation_input) 40 | x = merge([action_input, flattened_observation], mode='concat') 41 | x = Dense(32)(x) 42 | x = Activation('relu')(x) 43 | x = Dense(32)(x) 44 | x = Activation('relu')(x) 45 | x = Dense(32)(x) 46 | x = Activation('relu')(x) 47 | x = Dense(1)(x) 48 | x = Activation('linear')(x) 49 | critic = Model(input=[action_input, observation_input], output=x) 50 | print(critic.summary()) 51 | 52 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 53 | # even the metrics! 54 | memory = SequentialMemory(limit=100000, window_length=1) 55 | random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3) 56 | agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, 57 | memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, 58 | random_process=random_process, gamma=.99, target_model_update=1e-3) 59 | agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) 60 | 61 | # Okay, now it's time to learn something! We visualize the training here for show, but this 62 | # slows down training quite a lot. You can always safely abort the training prematurely using 63 | # Ctrl + C. 64 | agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200) 65 | 66 | # After training is done, we save the final weights. 67 | agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 68 | 69 | # Finally, evaluate our algorithm for 5 episodes. 
70 | agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200) 71 | -------------------------------------------------------------------------------- /tests/rl/agents/test_ddpg.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import absolute_import 3 | 4 | import pytest 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | 8 | from keras.models import Model, Sequential 9 | from keras.layers import Input, merge, Dense, Flatten 10 | 11 | from rl.agents.ddpg import DDPGAgent 12 | from rl.memory import SequentialMemory 13 | from rl.processors import MultiInputProcessor 14 | 15 | from ..util import MultiInputTestEnv 16 | 17 | 18 | def test_single_ddpg_input(): 19 | nb_actions = 2 20 | 21 | actor = Sequential() 22 | actor.add(Flatten(input_shape=(2, 3))) 23 | actor.add(Dense(nb_actions)) 24 | 25 | action_input = Input(shape=(nb_actions,), name='action_input') 26 | observation_input = Input(shape=(2, 3), name='observation_input') 27 | x = merge([action_input, Flatten()(observation_input)], mode='concat') 28 | x = Dense(1)(x) 29 | critic = Model(input=[action_input, observation_input], output=x) 30 | 31 | memory = SequentialMemory(limit=10, window_length=2) 32 | agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input, memory=memory, 33 | nb_actions=2, nb_steps_warmup_critic=5, nb_steps_warmup_actor=5, batch_size=4) 34 | agent.compile('sgd') 35 | agent.fit(MultiInputTestEnv((3,)), nb_steps=10) 36 | 37 | 38 | def test_multi_ddpg_input(): 39 | nb_actions = 2 40 | 41 | actor_observation_input1 = Input(shape=(2, 3), name='actor_observation_input1') 42 | actor_observation_input2 = Input(shape=(2, 4), name='actor_observation_input2') 43 | actor = Sequential() 44 | x = merge([actor_observation_input1, actor_observation_input2], mode='concat') 45 | x = Flatten()(x) 46 | x = Dense(nb_actions)(x) 47 | actor = Model(input=[actor_observation_input1, actor_observation_input2], output=x) 48 | 49 | action_input = Input(shape=(nb_actions,), name='action_input') 50 | critic_observation_input1 = Input(shape=(2, 3), name='critic_observation_input1') 51 | critic_observation_input2 = Input(shape=(2, 4), name='critic_observation_input2') 52 | x = merge([critic_observation_input1, critic_observation_input2], mode='concat') 53 | x = merge([action_input, Flatten()(x)], mode='concat') 54 | x = Dense(1)(x) 55 | critic = Model(input=[action_input, critic_observation_input1, critic_observation_input2], output=x) 56 | 57 | processor = MultiInputProcessor(nb_inputs=2) 58 | memory = SequentialMemory(limit=10, window_length=2) 59 | agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input, memory=memory, 60 | nb_actions=2, nb_steps_warmup_critic=5, nb_steps_warmup_actor=5, batch_size=4, 61 | processor=processor) 62 | agent.compile('sgd') 63 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10) 64 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | language: python 4 | matrix: 5 | include: 6 | - python: 3.5 7 | env: KERAS_BACKEND=theano 8 | - python: 3.5 9 | env: KERAS_BACKEND=tensorflow 10 | - python: 2.7 11 | env: KERAS_BACKEND=theano 12 | - python: 2.7 13 | env: KERAS_BACKEND=tensorflow 14 | - python: 2.7 15 | env: KERAS_BACKEND=tensorflow LEGACY_KERAS=1 16 | - python: 2.7 17 | env: 
KERAS_BACKEND=tensorflow TEST_MODE=PEP8 18 | - python: 2.7 19 | env: KERAS_BACKEND=theano TEST_MODE=INTEGRATION 20 | - python: 3.5 21 | env: KERAS_BACKEND=theano TEST_MODE=INTEGRATION 22 | - python: 2.7 23 | env: KERAS_BACKEND=tensorflow TEST_MODE=INTEGRATION 24 | - python: 3.5 25 | env: KERAS_BACKEND=tensorflow TEST_MODE=INTEGRATION 26 | install: 27 | # Adopted from https://github.com/fchollet/keras/blob/master/.travis.yml. 28 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 29 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; 30 | else 31 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 32 | fi 33 | - bash miniconda.sh -b -p $HOME/miniconda 34 | - export PATH="$HOME/miniconda/bin:$PATH" 35 | - hash -r 36 | - conda config --set always_yes yes --set changeps1 no 37 | - conda update -q conda 38 | # Useful for debugging any issues with conda 39 | - conda info -a 40 | 41 | - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib pandas pytest h5py 42 | - source activate test-environment 43 | - pip install pytest-xdist 44 | # See https://github.com/pytest-dev/pytest-cov/issues/124 for details 45 | - pip install pytest-cov==2.2.1 python-coveralls coverage==3.7.1 46 | - pip install pep8 pytest-pep8 47 | - pip install tensorflow 48 | # Bleeding-edge: pip install git+https://github.com/Theano/Theano.git 49 | - pip install theano>=0.9.0rc1 50 | - pip install gym 51 | # Bleeding-edge: pip install git+https://github.com/fchollet/keras.git; 52 | - if [[ "$LEGACY_KERAS" == "1" ]]; then 53 | pip install keras==1.2.2; 54 | else 55 | pip install "keras<2.0.7"; 56 | fi 57 | 58 | - python setup.py install 59 | 60 | # command to run tests. 61 | script: 62 | # Run keras backend init to initialize backend config. 63 | - python -c "import keras.backend" 64 | # Set up keras backend 65 | - sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' ~/.keras/keras.json; 66 | - echo -e "Running tests with the following config:\n$(cat ~/.keras/keras.json)" 67 | - if [[ "$TEST_MODE" == "INTEGRATION" ]]; then 68 | PYTHONPATH=$PWD:$PYTHONPATH py.test tests/integration; 69 | elif [[ "$TEST_MODE" == "PEP8" ]]; then 70 | PYTHONPATH=$PWD:$PYTHONPATH py.test --pep8 -m pep8 -n0; 71 | else 72 | PYTHONPATH=$PWD:$PYTHONPATH py.test tests/; 73 | fi 74 | after_success: 75 | - coveralls 76 | -------------------------------------------------------------------------------- /examples/ddpg_mujoco.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import gym 4 | from gym import wrappers 5 | 6 | from keras.models import Sequential, Model 7 | from keras.layers import Dense, Activation, Flatten, Input 8 | from keras.optimizers import Adam 9 | 10 | from rl.processors import WhiteningNormalizerProcessor 11 | from rl.agents import DDPGAgent 12 | from rl.memory import SequentialMemory 13 | from rl.random import OrnsteinUhlenbeckProcess 14 | from rl.keras_future import concatenate 15 | 16 | 17 | class MujocoProcessor(WhiteningNormalizerProcessor): 18 | def process_action(self, action): 19 | return np.clip(action, -1., 1.) 20 | 21 | 22 | ENV_NAME = 'HalfCheetah-v1' 23 | gym.undo_logger_setup() 24 | 25 | 26 | # Get the environment and extract the number of actions. 
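# The Monitor wrapper applied below records episode statistics (and videos, where supported)
# under /tmp/<ENV_NAME>; force=True overwrites results from previous runs.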
27 | env = gym.make(ENV_NAME) 28 | env = wrappers.Monitor(env, '/tmp/{}'.format(ENV_NAME), force=True) 29 | np.random.seed(123) 30 | env.seed(123) 31 | assert len(env.action_space.shape) == 1 32 | nb_actions = env.action_space.shape[0] 33 | 34 | # Next, we build a very simple model. 35 | actor = Sequential() 36 | actor.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 37 | actor.add(Dense(400)) 38 | actor.add(Activation('relu')) 39 | actor.add(Dense(300)) 40 | actor.add(Activation('relu')) 41 | actor.add(Dense(nb_actions)) 42 | actor.add(Activation('tanh')) 43 | print(actor.summary()) 44 | 45 | action_input = Input(shape=(nb_actions,), name='action_input') 46 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 47 | flattened_observation = Flatten()(observation_input) 48 | x = Dense(400)(flattened_observation) 49 | x = Activation('relu')(x) 50 | x = concatenate([x, action_input]) 51 | x = Dense(300)(x) 52 | x = Activation('relu')(x) 53 | x = Dense(1)(x) 54 | x = Activation('linear')(x) 55 | critic = Model(input=[action_input, observation_input], output=x) 56 | print(critic.summary()) 57 | 58 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 59 | # even the metrics! 60 | memory = SequentialMemory(limit=100000, window_length=1) 61 | random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1) 62 | agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, 63 | memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, 64 | random_process=random_process, gamma=.99, target_model_update=1e-3, 65 | processor=MujocoProcessor()) 66 | agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae']) 67 | 68 | # Okay, now it's time to learn something! We visualize the training here for show, but this 69 | # slows down training quite a lot. You can always safely abort the training prematurely using 70 | # Ctrl + C. 71 | agent.fit(env, nb_steps=1000000, visualize=False, verbose=1) 72 | 73 | # After training is done, we save the final weights. 74 | agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 75 | 76 | # Finally, evaluate our algorithm for 5 episodes. 
77 | agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200) 78 | -------------------------------------------------------------------------------- /tests/rl/test_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pytest 3 | import numpy as np 4 | from numpy.testing import assert_allclose 5 | 6 | from keras.models import Model, Sequential 7 | from keras.layers import Input, Dense, merge 8 | from keras.optimizers import SGD 9 | import keras.backend as K 10 | 11 | from rl.util import clone_optimizer, clone_model, huber_loss, WhiteningNormalizer 12 | 13 | 14 | def test_clone_sequential_model(): 15 | seq = Sequential() 16 | seq.add(Dense(8, input_shape=(3,))) 17 | seq.compile(optimizer='sgd', loss='mse') 18 | 19 | clone = clone_model(seq) 20 | clone.compile(optimizer='sgd', loss='mse') 21 | 22 | ins = np.random.random((4, 3)) 23 | y_pred_seq = seq.predict_on_batch(ins) 24 | y_pred_clone = clone.predict_on_batch(ins) 25 | assert y_pred_seq.shape == y_pred_clone.shape 26 | assert_allclose(y_pred_seq, y_pred_clone) 27 | 28 | 29 | def test_clone_graph_model(): 30 | in1 = Input(shape=(2,)) 31 | in2 = Input(shape=(3,)) 32 | x = Dense(8)(merge([in1, in2], mode='concat')) 33 | graph = Model([in1, in2], x) 34 | graph.compile(optimizer='sgd', loss='mse') 35 | 36 | clone = clone_model(graph) 37 | clone.compile(optimizer='sgd', loss='mse') 38 | 39 | ins = [np.random.random((4, 2)), np.random.random((4, 3))] 40 | y_pred_graph = graph.predict_on_batch(ins) 41 | y_pred_clone = clone.predict_on_batch(ins) 42 | assert y_pred_graph.shape == y_pred_clone.shape 43 | assert_allclose(y_pred_graph, y_pred_clone) 44 | 45 | 46 | def test_clone_optimizer(): 47 | lr, momentum, clipnorm, clipvalue = np.random.random(size=4) 48 | optimizer = SGD(lr=lr, momentum=momentum, clipnorm=clipnorm, clipvalue=clipvalue) 49 | clone = clone_optimizer(optimizer) 50 | 51 | assert isinstance(clone, SGD) 52 | assert K.get_value(optimizer.lr) == K.get_value(clone.lr) 53 | assert K.get_value(optimizer.momentum) == K.get_value(clone.momentum) 54 | assert optimizer.clipnorm == clone.clipnorm 55 | assert optimizer.clipvalue == clone.clipvalue 56 | 57 | 58 | def test_clone_optimizer_from_string(): 59 | clone = clone_optimizer('sgd') 60 | assert isinstance(clone, SGD) 61 | 62 | 63 | def test_huber_loss(): 64 | a = np.array([1., 1.5, 2., 4.]) 65 | b = np.array([1.5, 1., 4., 2.]) 66 | assert_allclose(K.eval(huber_loss(a, b, 1.)), np.array([.125, .125, 1.5, 1.5])) 67 | assert_allclose(K.eval(huber_loss(a, b, 3.)), np.array([.125, .125, 2., 2.])) 68 | assert_allclose(K.eval(huber_loss(a, b, np.inf)), np.array([.125, .125, 2., 2.])) 69 | 70 | 71 | def test_whitening_normalizer(): 72 | x = np.random.normal(loc=.2, scale=2., size=(1000, 5)) 73 | normalizer = WhiteningNormalizer(shape=(5,)) 74 | normalizer.update(x[:500]) 75 | normalizer.update(x[500:]) 76 | 77 | assert_allclose(normalizer.mean, np.mean(x, axis=0)) 78 | assert_allclose(normalizer.std, np.std(x, axis=0)) 79 | 80 | x_norm = normalizer.normalize(x) 81 | assert_allclose(np.mean(x_norm, axis=0), np.zeros(5, dtype=normalizer.dtype), atol=1e-5) 82 | assert_allclose(np.std(x_norm, axis=0), np.ones(5, dtype=normalizer.dtype), atol=1e-5) 83 | 84 | x_denorm = normalizer.denormalize(x_norm) 85 | assert_allclose(x_denorm, x) 86 | 87 | 88 | if __name__ == '__main__': 89 | pytest.main([__file__]) 90 | -------------------------------------------------------------------------------- 
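The Huber-loss expectations asserted in `test_huber_loss` above follow the standard piecewise definition. A small NumPy sketch of that convention (a reference formula only, not the library implementation, which lives in `rl/util.py`):

```python
import numpy as np


def huber_loss_reference(y_true, y_pred, clip_value):
    # Quadratic for small errors, linear beyond clip_value.
    d = np.abs(y_true - y_pred)
    quadratic = 0.5 * d ** 2
    linear = clip_value * d - 0.5 * clip_value ** 2
    return np.where(d <= clip_value, quadratic, linear)


# Reproduces the expected values from the test above:
# huber_loss_reference(np.array([1., 1.5, 2., 4.]), np.array([1.5, 1., 4., 2.]), 1.)
# -> array([0.125, 0.125, 1.5, 1.5])
```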
/examples/naf_pendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten, Input 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents import NAFAgent 9 | from rl.memory import SequentialMemory 10 | from rl.random import OrnsteinUhlenbeckProcess 11 | from rl.core import Processor 12 | from rl.keras_future import concatenate, Model 13 | 14 | class PendulumProcessor(Processor): 15 | def process_reward(self, reward): 16 | # The magnitude of the reward can be important. Since each step yields a relatively 17 | # high reward, we reduce the magnitude by two orders. 18 | return reward / 100. 19 | 20 | 21 | ENV_NAME = 'Pendulum-v0' 22 | gym.undo_logger_setup() 23 | 24 | 25 | # Get the environment and extract the number of actions. 26 | env = gym.make(ENV_NAME) 27 | np.random.seed(123) 28 | env.seed(123) 29 | assert len(env.action_space.shape) == 1 30 | nb_actions = env.action_space.shape[0] 31 | 32 | # Build all necessary models: V, mu, and L networks. 33 | V_model = Sequential() 34 | V_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 35 | V_model.add(Dense(16)) 36 | V_model.add(Activation('relu')) 37 | V_model.add(Dense(16)) 38 | V_model.add(Activation('relu')) 39 | V_model.add(Dense(16)) 40 | V_model.add(Activation('relu')) 41 | V_model.add(Dense(1)) 42 | V_model.add(Activation('linear')) 43 | print(V_model.summary()) 44 | 45 | mu_model = Sequential() 46 | mu_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 47 | mu_model.add(Dense(16)) 48 | mu_model.add(Activation('relu')) 49 | mu_model.add(Dense(16)) 50 | mu_model.add(Activation('relu')) 51 | mu_model.add(Dense(16)) 52 | mu_model.add(Activation('relu')) 53 | mu_model.add(Dense(nb_actions)) 54 | mu_model.add(Activation('linear')) 55 | print(mu_model.summary()) 56 | 57 | action_input = Input(shape=(nb_actions,), name='action_input') 58 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 59 | x = concatenate([action_input, Flatten()(observation_input)]) 60 | x = Dense(32)(x) 61 | x = Activation('relu')(x) 62 | x = Dense(32)(x) 63 | x = Activation('relu')(x) 64 | x = Dense(32)(x) 65 | x = Activation('relu')(x) 66 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 67 | x = Activation('linear')(x) 68 | L_model = Model(input=[action_input, observation_input], output=x) 69 | print(L_model.summary()) 70 | 71 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 72 | # even the metrics! 73 | processor = PendulumProcessor() 74 | memory = SequentialMemory(limit=100000, window_length=1) 75 | random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions) 76 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 77 | memory=memory, nb_steps_warmup=100, random_process=random_process, 78 | gamma=.99, target_model_update=1e-3, processor=processor) 79 | agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) 80 | 81 | # Okay, now it's time to learn something! We visualize the training here for show, but this 82 | # slows down training quite a lot. You can always safely abort the training prematurely using 83 | # Ctrl + C. 84 | agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200) 85 | 86 | # After training is done, we save the final weights. 
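# (If needed, they can be restored later with `agent.load_weights(...)` so the
# trained policy can be evaluated without retraining.)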
87 | agent.save_weights('cdqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 88 | 89 | # Finally, evaluate our algorithm for 5 episodes. 90 | agent.test(env, nb_episodes=10, visualize=True, nb_max_episode_steps=200) 91 | -------------------------------------------------------------------------------- /tests/integration/test_continuous.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import gym 5 | 6 | from keras.models import Sequential 7 | from keras.layers import Dense, Activation, Flatten, Input 8 | from keras.optimizers import Adam 9 | 10 | from rl.agents import NAFAgent, DDPGAgent 11 | from rl.random import OrnsteinUhlenbeckProcess 12 | from rl.memory import SequentialMemory 13 | from rl.keras_future import Model, concatenate 14 | 15 | 16 | def test_cdqn(): 17 | # TODO: replace this with a simpler environment where we can actually test if it finds a solution 18 | env = gym.make('Pendulum-v0') 19 | np.random.seed(123) 20 | env.seed(123) 21 | random.seed(123) 22 | nb_actions = env.action_space.shape[0] 23 | 24 | V_model = Sequential() 25 | V_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 26 | V_model.add(Dense(16)) 27 | V_model.add(Activation('relu')) 28 | V_model.add(Dense(1)) 29 | 30 | mu_model = Sequential() 31 | mu_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 32 | mu_model.add(Dense(16)) 33 | mu_model.add(Activation('relu')) 34 | mu_model.add(Dense(nb_actions)) 35 | 36 | action_input = Input(shape=(nb_actions,), name='action_input') 37 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 38 | x = concatenate([action_input, Flatten()(observation_input)]) 39 | x = Dense(16)(x) 40 | x = Activation('relu')(x) 41 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 42 | L_model = Model(input=[action_input, observation_input], output=x) 43 | 44 | memory = SequentialMemory(limit=1000, window_length=1) 45 | random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions) 46 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 47 | memory=memory, nb_steps_warmup=50, random_process=random_process, 48 | gamma=.99, target_model_update=1e-3) 49 | agent.compile(Adam(lr=1e-3)) 50 | 51 | agent.fit(env, nb_steps=400, visualize=False, verbose=0, nb_max_episode_steps=100) 52 | h = agent.test(env, nb_episodes=2, visualize=False, nb_max_episode_steps=100) 53 | # TODO: evaluate history 54 | 55 | 56 | def test_ddpg(): 57 | # TODO: replace this with a simpler environment where we can actually test if it finds a solution 58 | env = gym.make('Pendulum-v0') 59 | np.random.seed(123) 60 | env.seed(123) 61 | random.seed(123) 62 | nb_actions = env.action_space.shape[0] 63 | 64 | actor = Sequential() 65 | actor.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 66 | actor.add(Dense(16)) 67 | actor.add(Activation('relu')) 68 | actor.add(Dense(nb_actions)) 69 | actor.add(Activation('linear')) 70 | 71 | action_input = Input(shape=(nb_actions,), name='action_input') 72 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 73 | flattened_observation = Flatten()(observation_input) 74 | x = concatenate([action_input, flattened_observation]) 75 | x = Dense(16)(x) 76 | x = Activation('relu')(x) 77 | x = Dense(1)(x) 78 | x = Activation('linear')(x) 79 | critic = Model(input=[action_input, observation_input], output=x) 80 | 81 | 
memory = SequentialMemory(limit=1000, window_length=1) 82 | random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3) 83 | agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, 84 | memory=memory, nb_steps_warmup_critic=50, nb_steps_warmup_actor=50, 85 | random_process=random_process, gamma=.99, target_model_update=1e-3) 86 | agent.compile([Adam(lr=1e-3), Adam(lr=1e-3)]) 87 | 88 | agent.fit(env, nb_steps=400, visualize=False, verbose=0, nb_max_episode_steps=100) 89 | h = agent.test(env, nb_episodes=2, visualize=False, nb_max_episode_steps=100) 90 | # TODO: evaluate history 91 | -------------------------------------------------------------------------------- /rl/policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | 4 | from rl.util import * 5 | 6 | 7 | class Policy(object): 8 | def _set_agent(self, agent): 9 | self.agent = agent 10 | 11 | @property 12 | def metrics_names(self): 13 | return [] 14 | 15 | @property 16 | def metrics(self): 17 | return [] 18 | 19 | def select_action(self, **kwargs): 20 | raise NotImplementedError() 21 | 22 | def get_config(self): 23 | return {} 24 | 25 | 26 | class LinearAnnealedPolicy(Policy): 27 | def __init__(self, inner_policy, attr, value_max, value_min, value_test, nb_steps): 28 | if not hasattr(inner_policy, attr): 29 | raise ValueError('Policy "{}" does not have attribute "{}".'.format(attr)) 30 | 31 | super(LinearAnnealedPolicy, self).__init__() 32 | 33 | self.inner_policy = inner_policy 34 | self.attr = attr 35 | self.value_max = value_max 36 | self.value_min = value_min 37 | self.value_test = value_test 38 | self.nb_steps = nb_steps 39 | 40 | def get_current_value(self): 41 | if self.agent.training: 42 | # Linear annealed: f(x) = ax + b. 
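# The annealed value starts at value_max when agent.step == 0, decreases linearly,
# and is clamped at value_min once nb_steps have elapsed. For example, with
# value_max=1.0, value_min=0.1 and nb_steps=1000000, the value at step 500000 is 0.55.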
43 | a = -float(self.value_max - self.value_min) / float(self.nb_steps) 44 | b = float(self.value_max) 45 | value = max(self.value_min, a * float(self.agent.step) + b) 46 | else: 47 | value = self.value_test 48 | return value 49 | 50 | def select_action(self, **kwargs): 51 | setattr(self.inner_policy, self.attr, self.get_current_value()) 52 | return self.inner_policy.select_action(**kwargs) 53 | 54 | @property 55 | def metrics_names(self): 56 | return ['mean_{}'.format(self.attr)] 57 | 58 | @property 59 | def metrics(self): 60 | return [getattr(self.inner_policy, self.attr)] 61 | 62 | def get_config(self): 63 | config = super(LinearAnnealedPolicy, self).get_config() 64 | config['attr'] = self.attr 65 | config['value_max'] = self.value_max 66 | config['value_min'] = self.value_min 67 | config['value_test'] = self.value_test 68 | config['nb_steps'] = self.nb_steps 69 | config['inner_policy'] = get_object_config(self.inner_policy) 70 | return config 71 | 72 | 73 | class EpsGreedyQPolicy(Policy): 74 | def __init__(self, eps=.1): 75 | super(EpsGreedyQPolicy, self).__init__() 76 | self.eps = eps 77 | 78 | def select_action(self, q_values): 79 | assert q_values.ndim == 1 80 | nb_actions = q_values.shape[0] 81 | 82 | if np.random.uniform() < self.eps: 83 | action = np.random.random_integers(0, nb_actions-1) 84 | else: 85 | action = np.argmax(q_values) 86 | return action 87 | 88 | def get_config(self): 89 | config = super(EpsGreedyQPolicy, self).get_config() 90 | config['eps'] = self.eps 91 | return config 92 | 93 | 94 | class GreedyQPolicy(Policy): 95 | def select_action(self, q_values): 96 | assert q_values.ndim == 1 97 | action = np.argmax(q_values) 98 | return action 99 | 100 | 101 | class BoltzmannQPolicy(Policy): 102 | def __init__(self, tau=1., clip=(-500., 500.)): 103 | super(BoltzmannQPolicy, self).__init__() 104 | self.tau = tau 105 | self.clip = clip 106 | 107 | def select_action(self, q_values): 108 | assert q_values.ndim == 1 109 | q_values = q_values.astype('float64') 110 | nb_actions = q_values.shape[0] 111 | 112 | exp_values = np.exp(np.clip(q_values / self.tau, self.clip[0], self.clip[1])) 113 | probs = exp_values / np.sum(exp_values) 114 | action = np.random.choice(range(nb_actions), p=probs) 115 | return action 116 | 117 | def get_config(self): 118 | config = super(BoltzmannQPolicy, self).get_config() 119 | config['tau'] = self.tau 120 | config['clip'] = self.clip 121 | return config 122 | -------------------------------------------------------------------------------- /rl/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from keras.models import model_from_config, Sequential, Model, model_from_config 4 | import keras.optimizers as optimizers 5 | import keras.backend as K 6 | 7 | 8 | def clone_model(model, custom_objects={}): 9 | # Requires Keras 1.0.7 since get_config has breaking changes. 10 | config = { 11 | 'class_name': model.__class__.__name__, 12 | 'config': model.get_config(), 13 | } 14 | clone = model_from_config(config, custom_objects=custom_objects) 15 | clone.set_weights(model.get_weights()) 16 | return clone 17 | 18 | 19 | def clone_optimizer(optimizer): 20 | if type(optimizer) is str: 21 | return optimizers.get(optimizer) 22 | # Requires Keras 1.0.7 since get_config has breaking changes. 
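# Rebuilding the optimizer from its serialized config yields an independent copy that
# shares no internal state (e.g. accumulated gradients or moments) with the original.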
23 | params = dict([(k, v) for k, v in optimizer.get_config().items()]) 24 | config = { 25 | 'class_name': optimizer.__class__.__name__, 26 | 'config': params, 27 | } 28 | if hasattr(optimizers, 'optimizer_from_config'): 29 | # COMPATIBILITY: Keras < 2.0 30 | clone = optimizers.optimizer_from_config(config) 31 | else: 32 | clone = optimizers.deserialize(config) 33 | return clone 34 | 35 | 36 | def get_soft_target_model_updates(target, source, tau): 37 | target_weights = target.trainable_weights + sum([l.non_trainable_weights for l in target.layers], []) 38 | source_weights = source.trainable_weights + sum([l.non_trainable_weights for l in source.layers], []) 39 | assert len(target_weights) == len(source_weights) 40 | 41 | # Create updates. 42 | updates = [] 43 | for tw, sw in zip(target_weights, source_weights): 44 | updates.append((tw, tau * sw + (1. - tau) * tw)) 45 | return updates 46 | 47 | 48 | def get_object_config(o): 49 | if o is None: 50 | return None 51 | 52 | config = { 53 | 'class_name': o.__class__.__name__, 54 | 'config': o.get_config() 55 | } 56 | return config 57 | 58 | 59 | def huber_loss(y_true, y_pred, clip_value): 60 | # Huber loss, see https://en.wikipedia.org/wiki/Huber_loss and 61 | # https://medium.com/@karpathy/yes-you-should-understand-backprop-e2f06eab496b 62 | # for details. 63 | assert clip_value > 0. 64 | 65 | x = y_true - y_pred 66 | if np.isinf(clip_value): 67 | # Spacial case for infinity since Tensorflow does have problems 68 | # if we compare `K.abs(x) < np.inf`. 69 | return .5 * K.square(x) 70 | 71 | condition = K.abs(x) < clip_value 72 | squared_loss = .5 * K.square(x) 73 | linear_loss = clip_value * (K.abs(x) - .5 * clip_value) 74 | if K.backend() == 'tensorflow': 75 | import tensorflow as tf 76 | if hasattr(tf, 'select'): 77 | return tf.select(condition, squared_loss, linear_loss) # condition, true, false 78 | else: 79 | return tf.where(condition, squared_loss, linear_loss) # condition, true, false 80 | elif K.backend() == 'theano': 81 | from theano import tensor as T 82 | return T.switch(condition, squared_loss, linear_loss) 83 | else: 84 | raise RuntimeError('Unknown backend "{}".'.format(K.backend())) 85 | 86 | 87 | class AdditionalUpdatesOptimizer(optimizers.Optimizer): 88 | def __init__(self, optimizer, additional_updates): 89 | super(AdditionalUpdatesOptimizer, self).__init__() 90 | self.optimizer = optimizer 91 | self.additional_updates = additional_updates 92 | 93 | def get_updates(self, params, constraints, loss): 94 | updates = self.optimizer.get_updates(params, constraints, loss) 95 | updates += self.additional_updates 96 | self.updates = updates 97 | return self.updates 98 | 99 | def get_config(self): 100 | return self.optimizer.get_config() 101 | 102 | 103 | # Based on https://github.com/openai/baselines/blob/master/baselines/common/mpi_running_mean_std.py 104 | class WhiteningNormalizer(object): 105 | def __init__(self, shape, eps=1e-2, dtype=np.float64): 106 | self.eps = eps 107 | self.shape = shape 108 | self.dtype = dtype 109 | 110 | self._sum = np.zeros(shape, dtype=dtype) 111 | self._sumsq = np.zeros(shape, dtype=dtype) 112 | self._count = 0 113 | 114 | self.mean = np.zeros(shape, dtype=dtype) 115 | self.std = np.ones(shape, dtype=dtype) 116 | 117 | def normalize(self, x): 118 | return (x - self.mean) / self.std 119 | 120 | def denormalize(self, x): 121 | return self.std * x + self.mean 122 | 123 | def update(self, x): 124 | if x.ndim == len(self.shape): 125 | x = x.reshape(-1, *self.shape) 126 | assert x.shape[1:] == self.shape 127 
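# Keep running sums of x and x**2 so mean and std can be refreshed incrementally:
# mean = sum(x) / n and std = sqrt(max(eps**2, sum(x**2) / n - mean**2)), i.e. the
# population standard deviation floored at `eps` for numerical stability.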
| 128 | self._count += x.shape[0] 129 | self._sum += np.sum(x, axis=0) 130 | self._sumsq += np.sum(np.square(x), axis=0) 131 | 132 | self.mean = self._sum / float(self._count) 133 | self.std = np.sqrt(np.maximum(np.square(self.eps), self._sumsq / float(self._count) - np.square(self.mean))) 134 | -------------------------------------------------------------------------------- /tests/integration/test_discrete.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from numpy.testing import assert_allclose 5 | from gym.envs.debugging.two_round_deterministic_reward import TwoRoundDeterministicRewardEnv 6 | 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Activation, Flatten 9 | from keras.optimizers import Adam 10 | from rl.agents import DQNAgent, CEMAgent, SARSAAgent 11 | from rl.policy import EpsGreedyQPolicy 12 | from rl.memory import SequentialMemory, EpisodeParameterMemory 13 | 14 | 15 | def test_dqn(): 16 | env = TwoRoundDeterministicRewardEnv() 17 | np.random.seed(123) 18 | env.seed(123) 19 | random.seed(123) 20 | nb_actions = env.action_space.n 21 | 22 | # Next, we build a very simple model. 23 | model = Sequential() 24 | model.add(Dense(16, input_shape=(1,))) 25 | model.add(Activation('relu')) 26 | model.add(Dense(nb_actions)) 27 | model.add(Activation('linear')) 28 | 29 | memory = SequentialMemory(limit=1000, window_length=1) 30 | policy = EpsGreedyQPolicy(eps=.1) 31 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50, 32 | target_model_update=1e-1, policy=policy, enable_double_dqn=False) 33 | dqn.compile(Adam(lr=1e-3)) 34 | 35 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=0) 36 | policy.eps = 0. 37 | h = dqn.test(env, nb_episodes=20, visualize=False) 38 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 39 | 40 | 41 | def test_double_dqn(): 42 | env = TwoRoundDeterministicRewardEnv() 43 | np.random.seed(123) 44 | env.seed(123) 45 | random.seed(123) 46 | nb_actions = env.action_space.n 47 | 48 | # Next, we build a very simple model. 49 | model = Sequential() 50 | model.add(Dense(16, input_shape=(1,))) 51 | model.add(Activation('relu')) 52 | model.add(Dense(nb_actions)) 53 | model.add(Activation('linear')) 54 | 55 | memory = SequentialMemory(limit=1000, window_length=1) 56 | policy = EpsGreedyQPolicy(eps=.1) 57 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50, 58 | target_model_update=1e-1, policy=policy, enable_double_dqn=True) 59 | dqn.compile(Adam(lr=1e-3)) 60 | 61 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=0) 62 | policy.eps = 0. 63 | h = dqn.test(env, nb_episodes=20, visualize=False) 64 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 65 | 66 | 67 | def test_cem(): 68 | env = TwoRoundDeterministicRewardEnv() 69 | np.random.seed(123) 70 | env.seed(123) 71 | random.seed(123) 72 | nb_actions = env.action_space.n 73 | 74 | # Next, we build a very simple model. 
75 | model = Sequential() 76 | model.add(Dense(16, input_shape=(1,))) 77 | model.add(Activation('relu')) 78 | model.add(Dense(nb_actions)) 79 | model.add(Activation('linear')) 80 | 81 | memory = EpisodeParameterMemory(limit=1000, window_length=1) 82 | dqn = CEMAgent(model=model, nb_actions=nb_actions, memory=memory) 83 | dqn.compile() 84 | 85 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=1) 86 | h = dqn.test(env, nb_episodes=20, visualize=False) 87 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 88 | 89 | 90 | def test_duel_dqn(): 91 | env = TwoRoundDeterministicRewardEnv() 92 | np.random.seed(123) 93 | env.seed(123) 94 | random.seed(123) 95 | nb_actions = env.action_space.n 96 | 97 | # Next, we build a very simple model. 98 | model = Sequential() 99 | model.add(Dense(16, input_shape=(1,))) 100 | model.add(Activation('relu')) 101 | model.add(Dense(nb_actions, activation='linear')) 102 | 103 | memory = SequentialMemory(limit=1000, window_length=1) 104 | policy = EpsGreedyQPolicy(eps=.1) 105 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50, 106 | target_model_update=1e-1, policy=policy, enable_double_dqn=False, enable_dueling_network=True) 107 | dqn.compile(Adam(lr=1e-3)) 108 | 109 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=0) 110 | policy.eps = 0. 111 | h = dqn.test(env, nb_episodes=20, visualize=False) 112 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 113 | 114 | 115 | def test_sarsa(): 116 | env = TwoRoundDeterministicRewardEnv() 117 | np.random.seed(123) 118 | env.seed(123) 119 | random.seed(123) 120 | nb_actions = env.action_space.n 121 | 122 | # Next, we build a very simple model. 123 | model = Sequential() 124 | model.add(Dense(16, input_shape=(1,))) 125 | model.add(Activation('relu')) 126 | model.add(Dense(nb_actions, activation='linear')) 127 | 128 | policy = EpsGreedyQPolicy(eps=.1) 129 | sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=50, policy=policy) 130 | sarsa.compile(Adam(lr=1e-3)) 131 | 132 | sarsa.fit(env, nb_steps=20000, visualize=False, verbose=0) 133 | policy.eps = 0. 134 | h = sarsa.test(env, nb_episodes=20, visualize=False) 135 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 
136 | -------------------------------------------------------------------------------- /examples/dqn_atari.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import argparse 3 | 4 | from PIL import Image 5 | import numpy as np 6 | import gym 7 | 8 | from keras.models import Sequential 9 | from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute 10 | from keras.optimizers import Adam 11 | import keras.backend as K 12 | 13 | from rl.agents.dqn import DQNAgent 14 | from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy 15 | from rl.memory import SequentialMemory 16 | from rl.core import Processor 17 | from rl.callbacks import FileLogger, ModelIntervalCheckpoint 18 | 19 | 20 | INPUT_SHAPE = (84, 84) 21 | WINDOW_LENGTH = 4 22 | 23 | 24 | class AtariProcessor(Processor): 25 | def process_observation(self, observation): 26 | assert observation.ndim == 3 # (height, width, channel) 27 | img = Image.fromarray(observation) 28 | img = img.resize(INPUT_SHAPE).convert('L') # resize and convert to grayscale 29 | processed_observation = np.array(img) 30 | assert processed_observation.shape == INPUT_SHAPE 31 | return processed_observation.astype('uint8') # saves storage in experience memory 32 | 33 | def process_state_batch(self, batch): 34 | # We could perform this processing step in `process_observation`. In this case, however, 35 | # we would need to store a `float32` array instead, which is 4x more memory intensive than 36 | # an `uint8` array. This matters if we store 1M observations. 37 | processed_batch = batch.astype('float32') / 255. 38 | return processed_batch 39 | 40 | def process_reward(self, reward): 41 | return np.clip(reward, -1., 1.) 42 | 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('--mode', choices=['train', 'test'], default='train') 45 | parser.add_argument('--env-name', type=str, default='BreakoutDeterministic-v4') 46 | parser.add_argument('--weights', type=str, default=None) 47 | args = parser.parse_args() 48 | 49 | # Get the environment and extract the number of actions. 50 | env = gym.make(args.env_name) 51 | np.random.seed(123) 52 | env.seed(123) 53 | nb_actions = env.action_space.n 54 | 55 | # Next, we build our model. We use the same model that was described by Mnih et al. (2015). 56 | input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE 57 | model = Sequential() 58 | if K.image_dim_ordering() == 'tf': 59 | # (width, height, channels) 60 | model.add(Permute((2, 3, 1), input_shape=input_shape)) 61 | elif K.image_dim_ordering() == 'th': 62 | # (channels, width, height) 63 | model.add(Permute((1, 2, 3), input_shape=input_shape)) 64 | else: 65 | raise RuntimeError('Unknown image_dim_ordering.') 66 | model.add(Convolution2D(32, 8, 8, subsample=(4, 4))) 67 | model.add(Activation('relu')) 68 | model.add(Convolution2D(64, 4, 4, subsample=(2, 2))) 69 | model.add(Activation('relu')) 70 | model.add(Convolution2D(64, 3, 3, subsample=(1, 1))) 71 | model.add(Activation('relu')) 72 | model.add(Flatten()) 73 | model.add(Dense(512)) 74 | model.add(Activation('relu')) 75 | model.add(Dense(nb_actions)) 76 | model.add(Activation('linear')) 77 | print(model.summary()) 78 | 79 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 80 | # even the metrics! 81 | memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH) 82 | processor = AtariProcessor() 83 | 84 | # Select a policy. 
We use eps-greedy action selection, which means that a random action is selected 85 | # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that 86 | # the agent initially explores the environment (high eps) and then gradually sticks to what it knows 87 | # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05 88 | # so that the agent still performs some random actions. This ensures that the agent cannot get stuck. 89 | policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, 90 | nb_steps=1000000) 91 | 92 | # The trade-off between exploration and exploitation is difficult and an on-going research topic. 93 | # If you want, you can experiment with the parameters or use a different policy. Another popular one 94 | # is Boltzmann-style exploration: 95 | # policy = BoltzmannQPolicy(tau=1.) 96 | # Feel free to give it a try! 97 | 98 | dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, 99 | processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000, 100 | train_interval=4, delta_clip=1.) 101 | dqn.compile(Adam(lr=.00025), metrics=['mae']) 102 | 103 | if args.mode == 'train': 104 | # Okay, now it's time to learn something! We capture the interrupt exception so that training 105 | # can be prematurely aborted. Notice that you can the built-in Keras callbacks! 106 | weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) 107 | checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f' 108 | log_filename = 'dqn_{}_log.json'.format(args.env_name) 109 | callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)] 110 | callbacks += [FileLogger(log_filename, interval=100)] 111 | dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000) 112 | 113 | # After training is done, we save the final weights one more time. 114 | dqn.save_weights(weights_filename, overwrite=True) 115 | 116 | # Finally, evaluate our algorithm for 10 episodes. 117 | dqn.test(env, nb_episodes=10, visualize=False) 118 | elif args.mode == 'test': 119 | weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) 120 | if args.weights: 121 | weights_filename = args.weights 122 | dqn.load_weights(weights_filename) 123 | dqn.test(env, nb_episodes=10, visualize=True) 124 | -------------------------------------------------------------------------------- /docs/sources/agents/overview.md: -------------------------------------------------------------------------------- 1 | ## Available Agents 2 | 3 | | Name | Implementation | Observation Space | Action Space | 4 | | ---------------------- |------------------------| -------------------| ---------------| 5 | | [DQN](/agents/dqn) | `rl.agents.DQNAgent` | discrete or continuous | discrete | 6 | | [DDPG](/agents/ddpg) | `rl.agents.DDPGAgent` | discrete or continuous | continuous | 7 | | [NAF](/agents/naf) | `rl.agents.NAFAgent` | discrete or continuous | continuous | 8 | | [CEM](/agents/cem) | `rl.agents.CEMAgent` | discrete or continuous | discrete | 9 | | [SARSA](/agents/sarsa) | `rl.agents.SARSAAgent` | discrete or continuous | discrete | 10 | 11 | --- 12 | 13 | ## Common API 14 | 15 | All agents share a common API. This allows you to easily switch between different agents. 16 | That being said, keep in mind that some agents make assumptions regarding the action space, i.e. assume discrete 17 | or continuous actions. 
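
To make the shared workflow concrete, here is a minimal sketch of the common `compile` -> `fit` -> `test` cycle. It assumes that `agent` is an already-constructed keras-rl agent (for example a `DQNAgent`) and that `env` is an OpenAI Gym environment; the step counts and file name are illustrative only:

```python
from keras.optimizers import Adam

# Build the training function (optimizer plus optional extra metrics).
agent.compile(Adam(lr=1e-3), metrics=['mae'])

# Interact with the environment and learn for a fixed number of steps.
agent.fit(env, nb_steps=50000, visualize=False, verbose=2)

# Persist the learned weights and evaluate the greedy policy.
agent.save_weights('agent_weights.h5f', overwrite=True)
agent.test(env, nb_episodes=5, visualize=True)
```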
18 | 19 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L44) 20 | 21 | ### fit 22 | 23 | 24 | ```python 25 | fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None) 26 | ``` 27 | 28 | 29 | Trains the agent on the given environment. 30 | 31 | __Arguments__ 32 | 33 | - __env:__ (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. 34 | - __nb_steps__ (integer): Number of training steps to be performed. 35 | - __action_repetition__ (integer): Number of times the agent repeats the same action without 36 | observing the environment again. Setting this to a value > 1 can be useful 37 | if a single action only has a very small effect on the environment. 38 | - __callbacks__ (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): 39 | List of callbacks to apply during training. See [callbacks](/callbacks) for details. 40 | - __verbose__ (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging 41 | - __visualize__ (boolean): If `True`, the environment is visualized during training. However, 42 | this is likely going to slow down training significantly and is thus intended to be 43 | a debugging instrument. 44 | - __nb_max_start_steps__ (integer): Number of maximum steps that the agent performs at the beginning 45 | of each episode using `start_step_policy`. Notice that this is an upper limit since 46 | the exact number of steps to be performed is sampled uniformly from [0, max_start_steps] 47 | at the beginning of each episode. 48 | - __start_step_policy__ (`lambda observation: action`): The policy 49 | to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. 50 | - __log_interval__ (integer): If `verbose` = 1, the number of steps that are considered to be an interval. 51 | - __nb_max_episode_steps__ (integer): Number of steps per episode that the agent performs before 52 | automatically resetting the environment. Set to `None` if each episode should run 53 | (potentially indefinitely) until the environment signals a terminal state. 54 | 55 | __Returns__ 56 | 57 | A `keras.callbacks.History` instance that recorded the entire training process. 58 | 59 | ---- 60 | 61 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L231) 62 | 63 | ### test 64 | 65 | 66 | ```python 67 | test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1) 68 | ``` 69 | 70 | 71 | Callback that is called before training begins." 72 | 73 | ---- 74 | 75 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L391) 76 | 77 | ### compile 78 | 79 | 80 | ```python 81 | compile(self, optimizer, metrics=[]) 82 | ``` 83 | 84 | 85 | Compiles an agent and the underlaying models to be used for training and testing. 86 | 87 | __Arguments__ 88 | 89 | - __optimizer__ (`keras.optimizers.Optimizer` instance): The optimizer to be used during training. 90 | - __metrics__ (list of functions `lambda y_true, y_pred: metric`): The metrics to run during training. 
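
As a rough illustration (assuming a single-network agent such as `DQNAgent`; agents with several networks, e.g. `DDPGAgent`, expect a list of optimizers instead), a custom metric function can be passed alongside Keras' built-in ones:

```python
from keras.optimizers import Adam
import keras.backend as K

def max_abs_error(y_true, y_pred):
    # Illustrative custom metric: largest absolute error between targets and predictions.
    return K.max(K.abs(y_true - y_pred))

agent.compile(Adam(lr=1e-3), metrics=[max_abs_error, 'mae'])
```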
91 | 92 | ---- 93 | 94 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L39) 95 | 96 | ### get_config 97 | 98 | 99 | ```python 100 | get_config(self) 101 | ``` 102 | 103 | 104 | Configuration of the agent for serialization. 105 | 106 | ---- 107 | 108 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L364) 109 | 110 | ### reset_states 111 | 112 | 113 | ```python 114 | reset_states(self) 115 | ``` 116 | 117 | 118 | Resets all internally kept states after an episode is completed. 119 | 120 | ---- 121 | 122 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L400) 123 | 124 | ### load_weights 125 | 126 | 127 | ```python 128 | load_weights(self, filepath) 129 | ``` 130 | 131 | 132 | Loads the weights of an agent from an HDF5 file. 133 | 134 | __Arguments__ 135 | 136 | - __filepath__ (str): The path to the HDF5 file. 137 | 138 | ---- 139 | 140 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L408) 141 | 142 | ### save_weights 143 | 144 | 145 | ```python 146 | save_weights(self, filepath, overwrite=False) 147 | ``` 148 | 149 | 150 | Saves the weights of an agent as an HDF5 file. 151 | 152 | __Arguments__ 153 | 154 | - __filepath__ (str): The path to where the weights should be saved. 155 | - __overwrite__ (boolean): If `False` and `filepath` already exists, raises an error. 156 | 157 | -------------------------------------------------------------------------------- /tests/rl/test_core.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pytest 3 | import numpy as np 4 | from numpy.testing import assert_allclose 5 | 6 | from rl.memory import SequentialMemory 7 | from rl.core import Agent, Env, Processor 8 | 9 | 10 | class TestEnv(Env): 11 | def __init__(self): 12 | super(TestEnv, self).__init__() 13 | 14 | def step(self, action): 15 | self.state += 1 16 | done = self.state >= 6 17 | reward = float(self.state) / 10. 18 | return np.array(self.state), reward, done, {} 19 | 20 | def reset(self): 21 | self.state = 1 22 | return np.array(self.state) 23 | 24 | def seed(self, seed=None): 25 | pass 26 | 27 | def configure(self, *args, **kwargs): 28 | pass 29 | 30 | 31 | class TestAgent(Agent): 32 | def __init__(self, memory, **kwargs): 33 | super(TestAgent, self).__init__(**kwargs) 34 | self.memory = memory 35 | 36 | def forward(self, observation): 37 | action = observation 38 | self.recent_action = action 39 | self.recent_observation = observation 40 | return action 41 | 42 | def backward(self, reward, terminal): 43 | metrics = [np.nan for _ in self.metrics_names] 44 | self.memory.append(self.recent_observation, self.recent_action, reward, terminal) 45 | return metrics 46 | 47 | def compile(self): 48 | self.compiled = True 49 | 50 | 51 | def test_fit_observations(): 52 | memory = SequentialMemory(100, window_length=2, ignore_episode_boundaries=False) 53 | agent = TestAgent(memory) 54 | env = TestEnv() 55 | agent.compile() 56 | agent.fit(env, 20, verbose=0) 57 | 58 | # Inspect memory to see if observations are correct. 
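# Because the memory was created with window_length=2, every sampled state stacks the
# two most recent observations, so state0/state1 below are consecutive pairs
# (zero-padded at the start of an episode).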
59 | experiencies = memory.sample(batch_size=8, batch_idxs=range(8)) 60 | 61 | assert experiencies[0].reward == .2 62 | assert experiencies[0].action == 1 63 | assert_allclose(experiencies[0].state0, np.array([0, 1])) 64 | assert_allclose(experiencies[0].state1, np.array([1, 2])) 65 | assert experiencies[0].terminal1 is False 66 | 67 | assert experiencies[1].reward == .3 68 | assert experiencies[1].action == 2 69 | assert_allclose(experiencies[1].state0, np.array([1, 2])) 70 | assert_allclose(experiencies[1].state1, np.array([2, 3])) 71 | assert experiencies[1].terminal1 is False 72 | 73 | assert experiencies[2].reward == .4 74 | assert experiencies[2].action == 3 75 | assert_allclose(experiencies[2].state0, np.array([2, 3])) 76 | assert_allclose(experiencies[2].state1, np.array([3, 4])) 77 | assert experiencies[2].terminal1 is False 78 | 79 | assert experiencies[3].reward == .5 80 | assert experiencies[3].action == 4 81 | assert_allclose(experiencies[3].state0, np.array([3, 4])) 82 | assert_allclose(experiencies[3].state1, np.array([4, 5])) 83 | assert experiencies[3].terminal1 is False 84 | 85 | assert experiencies[4].reward == .6 86 | assert experiencies[4].action == 5 87 | assert_allclose(experiencies[4].state0, np.array([4, 5])) 88 | assert_allclose(experiencies[4].state1, np.array([5, 6])) 89 | assert experiencies[4].terminal1 is True 90 | 91 | # Experience 5 has been re-sampled since since state0 would be terminal in which case we 92 | # cannot really have a meaningful transition because the environment gets reset. We thus 93 | # just ensure that state0 is not terminal. 94 | assert not np.all(experiencies[5].state0 == np.array([5, 6])) 95 | 96 | assert experiencies[6].reward == .2 97 | assert experiencies[6].action == 1 98 | assert_allclose(experiencies[6].state0, np.array([0, 1])) 99 | assert_allclose(experiencies[6].state1, np.array([1, 2])) 100 | assert experiencies[6].terminal1 is False 101 | 102 | assert experiencies[7].reward == .3 103 | assert experiencies[7].action == 2 104 | assert_allclose(experiencies[7].state0, np.array([1, 2])) 105 | assert_allclose(experiencies[7].state1, np.array([2, 3])) 106 | assert experiencies[7].terminal1 is False 107 | 108 | 109 | def test_copy_observations(): 110 | methods = [ 111 | 'fit', 112 | 'test', 113 | ] 114 | 115 | for method in methods: 116 | original_observations = [] 117 | 118 | class LocalEnv(Env): 119 | def __init__(self): 120 | super(LocalEnv, self).__init__() 121 | 122 | def step(self, action): 123 | self.state += 1 124 | done = self.state >= 6 125 | reward = float(self.state) / 10. 126 | obs = np.array(self.state) 127 | original_observations.append(obs) 128 | return obs, reward, done, {} 129 | 130 | def reset(self): 131 | self.state = 1 132 | return np.array(self.state) 133 | 134 | def seed(self, seed=None): 135 | pass 136 | 137 | def configure(self, *args, **kwargs): 138 | pass 139 | 140 | # Slight abuse of the processor for test purposes. 
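# Overriding Processor.process_step lets us record every observation the agent receives,
# so we can later check that they equal (but are copies of, not the same objects as)
# the observations the environment produced.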
141 | observations = [] 142 | 143 | class LocalProcessor(Processor): 144 | def process_step(self, observation, reward, done, info): 145 | observations.append(observation) 146 | return observation, reward, done, info 147 | 148 | processor = LocalProcessor() 149 | memory = SequentialMemory(100, window_length=1) 150 | agent = TestAgent(memory, processor=processor) 151 | env = LocalEnv() 152 | agent.compile() 153 | getattr(agent, method)(env, 20, verbose=0, visualize=False) 154 | 155 | assert len(observations) == len(original_observations) 156 | assert_allclose(np.array(observations), np.array(original_observations)) 157 | assert np.all([o is not o_ for o, o_ in zip(original_observations, observations)]) 158 | 159 | 160 | if __name__ == '__main__': 161 | pytest.main([__file__]) 162 | -------------------------------------------------------------------------------- /rl/agents/cem.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import deque 3 | from copy import deepcopy 4 | 5 | import numpy as np 6 | import keras.backend as K 7 | from keras.models import Model 8 | 9 | from rl.core import Agent 10 | from rl.util import * 11 | 12 | class CEMAgent(Agent): 13 | """Write me 14 | """ 15 | def __init__(self, model, nb_actions, memory, batch_size=50, nb_steps_warmup=1000, 16 | train_interval=50, elite_frac=0.05, memory_interval=1, theta_init=None, 17 | noise_decay_const=0.0, noise_ampl=0.0, **kwargs): 18 | super(CEMAgent, self).__init__(**kwargs) 19 | 20 | # Parameters. 21 | self.nb_actions = nb_actions 22 | self.batch_size = batch_size 23 | self.elite_frac = elite_frac 24 | self.num_best = int(self.batch_size * self.elite_frac) 25 | self.nb_steps_warmup = nb_steps_warmup 26 | self.train_interval = train_interval 27 | self.memory_interval = memory_interval 28 | 29 | # if using noisy CEM, the minimum standard deviation will be ampl * exp (- decay_const * step ) 30 | self.noise_decay_const = noise_decay_const 31 | self.noise_ampl = noise_ampl 32 | 33 | # default initial mean & cov, override this by passing an theta_init argument 34 | self.init_mean = 0.0 35 | self.init_stdev = 1.0 36 | 37 | # Related objects. 38 | self.memory = memory 39 | self.model = model 40 | self.shapes = [w.shape for w in model.get_weights()] 41 | self.sizes = [w.size for w in model.get_weights()] 42 | self.num_weights = sum(self.sizes) 43 | 44 | # store the best result seen during training, as a tuple (reward, flat_weights) 45 | self.best_seen = (-np.inf, np.zeros(self.num_weights)) 46 | 47 | self.theta = np.zeros(self.num_weights*2) 48 | self.update_theta(theta_init) 49 | 50 | # State. 
51 | self.episode = 0 52 | self.compiled = False 53 | self.reset_states() 54 | 55 | def compile(self): 56 | self.model.compile(optimizer='sgd', loss='mse') 57 | self.compiled = True 58 | 59 | def load_weights(self, filepath): 60 | self.model.load_weights(filepath) 61 | 62 | def save_weights(self, filepath, overwrite=False): 63 | self.model.save_weights(filepath, overwrite=overwrite) 64 | 65 | def get_weights_flat(self,weights): 66 | weights_flat = np.zeros(self.num_weights) 67 | 68 | pos = 0 69 | for i_layer, size in enumerate(self.sizes): 70 | weights_flat[pos:pos+size] = weights[i_layer].flatten() 71 | pos += size 72 | return weights_flat 73 | 74 | def get_weights_list(self,weights_flat): 75 | weights = [] 76 | pos = 0 77 | for i_layer, size in enumerate(self.sizes): 78 | arr = weights_flat[pos:pos+size].reshape(self.shapes[i_layer]) 79 | weights.append(arr) 80 | pos += size 81 | return weights 82 | 83 | def reset_states(self): 84 | self.recent_observation = None 85 | self.recent_action = None 86 | 87 | def select_action(self, state, stochastic=False): 88 | batch = np.array([state]) 89 | if self.processor is not None: 90 | batch = self.processor.process_state_batch(batch) 91 | 92 | action = self.model.predict_on_batch(batch).flatten() 93 | if stochastic or self.training: 94 | return np.random.choice(np.arange(self.nb_actions), p=np.exp(action) / np.sum(np.exp(action))) 95 | return np.argmax(action) 96 | 97 | def update_theta(self,theta): 98 | if (theta is not None): 99 | assert theta.shape == self.theta.shape, "Invalid theta, shape is {0} but should be {1}".format(theta.shape,self.theta.shape) 100 | assert (not np.isnan(theta).any()), "Invalid theta, NaN encountered" 101 | assert (theta[self.num_weights:] >= 0.).all(), "Invalid theta, standard deviations must be nonnegative" 102 | self.theta = theta 103 | else: 104 | means = np.ones(self.num_weights) * self.init_mean 105 | stdevs = np.ones(self.num_weights) * self.init_stdev 106 | self.theta = np.hstack((means,stdevs)) 107 | 108 | def choose_weights(self): 109 | mean = self.theta[:self.num_weights] 110 | std = self.theta[self.num_weights:] 111 | weights_flat = std * np.random.randn(self.num_weights) + mean 112 | 113 | sampled_weights = self.get_weights_list(weights_flat) 114 | self.model.set_weights(sampled_weights) 115 | 116 | def forward(self, observation): 117 | # Select an action. 118 | state = self.memory.get_recent_state(observation) 119 | action = self.select_action(state) 120 | if self.processor is not None: 121 | action = self.processor.process_action(action) 122 | 123 | # Book-keeping. 124 | self.recent_observation = observation 125 | self.recent_action = action 126 | 127 | return action 128 | 129 | @property 130 | def layers(self): 131 | return self.model.layers[:] 132 | 133 | def backward(self, reward, terminal): 134 | # Store most recent experience in memory. 135 | if self.step % self.memory_interval == 0: 136 | self.memory.append(self.recent_observation, self.recent_action, reward, terminal, 137 | training=self.training) 138 | 139 | metrics = [np.nan for _ in self.metrics_names] 140 | if not self.training: 141 | # We're done here. No need to update the experience memory since we only use the working 142 | # memory to obtain the state over the most recent observations. 
143 | return metrics 144 | 145 | if terminal: 146 | params = self.get_weights_flat(self.model.get_weights()) 147 | self.memory.finalize_episode(params) 148 | 149 | if self.step > self.nb_steps_warmup and self.episode % self.train_interval == 0: 150 | params, reward_totals = self.memory.sample(self.batch_size) 151 | best_idx = np.argsort(np.array(reward_totals))[-self.num_best:] 152 | best = np.vstack([params[i] for i in best_idx]) 153 | 154 | if reward_totals[best_idx[-1]] > self.best_seen[0]: 155 | self.best_seen = (reward_totals[best_idx[-1]], params[best_idx[-1]]) 156 | 157 | metrics = [np.mean(np.array(reward_totals)[best_idx])] 158 | if self.processor is not None: 159 | metrics += self.processor.metrics 160 | min_std = self.noise_ampl * np.exp(-self.step * self.noise_decay_const) 161 | 162 | mean = np.mean(best, axis=0) 163 | std = np.std(best, axis=0) + min_std 164 | new_theta = np.hstack((mean, std)) 165 | self.update_theta(new_theta) 166 | self.choose_weights() 167 | self.episode += 1 168 | return metrics 169 | 170 | def _on_train_end(self): 171 | self.model.set_weights(self.get_weights_list(self.best_seen[1])) 172 | 173 | @property 174 | def metrics_names(self): 175 | names = ['mean_best_reward'] 176 | if self.processor is not None: 177 | names += self.processor.metrics_names[:] 178 | return names 179 | -------------------------------------------------------------------------------- /tests/rl/agents/test_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import absolute_import 3 | 4 | import pytest 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | 8 | from keras.models import Sequential 9 | from keras.layers import Input, merge, Dense, Flatten 10 | 11 | from rl.agents.dqn import NAFLayer, DQNAgent, NAFAgent 12 | from rl.memory import SequentialMemory 13 | from rl.processors import MultiInputProcessor 14 | from rl.keras_future import concatenate, Model 15 | 16 | from ..util import MultiInputTestEnv 17 | 18 | 19 | def test_single_dqn_input(): 20 | model = Sequential() 21 | model.add(Flatten(input_shape=(2, 3))) 22 | model.add(Dense(2)) 23 | 24 | memory = SequentialMemory(limit=10, window_length=2) 25 | for double_dqn in (True, False): 26 | agent = DQNAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, 27 | enable_double_dqn=double_dqn) 28 | agent.compile('sgd') 29 | agent.fit(MultiInputTestEnv((3,)), nb_steps=10) 30 | 31 | 32 | def test_multi_dqn_input(): 33 | input1 = Input(shape=(2, 3)) 34 | input2 = Input(shape=(2, 4)) 35 | x = merge([input1, input2], mode='concat') 36 | x = Flatten()(x) 37 | x = Dense(2)(x) 38 | model = Model(input=[input1, input2], output=x) 39 | 40 | memory = SequentialMemory(limit=10, window_length=2) 41 | processor = MultiInputProcessor(nb_inputs=2) 42 | for double_dqn in (True, False): 43 | agent = DQNAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, 44 | processor=processor, enable_double_dqn=double_dqn) 45 | agent.compile('sgd') 46 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10) 47 | 48 | 49 | def test_single_continuous_dqn_input(): 50 | nb_actions = 2 51 | 52 | V_model = Sequential() 53 | V_model.add(Flatten(input_shape=(2, 3))) 54 | V_model.add(Dense(1)) 55 | 56 | mu_model = Sequential() 57 | mu_model.add(Flatten(input_shape=(2, 3))) 58 | mu_model.add(Dense(nb_actions)) 59 | 60 | L_input = Input(shape=(2, 3)) 61 | L_input_action = Input(shape=(nb_actions,)) 62 | x = 
concatenate([Flatten()(L_input), L_input_action]) 63 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 64 | L_model = Model(input=[L_input_action, L_input], output=x) 65 | 66 | memory = SequentialMemory(limit=10, window_length=2) 67 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 68 | memory=memory, nb_steps_warmup=5, batch_size=4) 69 | agent.compile('sgd') 70 | agent.fit(MultiInputTestEnv((3,)), nb_steps=10) 71 | 72 | 73 | def test_multi_continuous_dqn_input(): 74 | nb_actions = 2 75 | 76 | V_input1 = Input(shape=(2, 3)) 77 | V_input2 = Input(shape=(2, 4)) 78 | x = concatenate([V_input1, V_input2]) 79 | x = Flatten()(x) 80 | x = Dense(1)(x) 81 | V_model = Model(input=[V_input1, V_input2], output=x) 82 | 83 | mu_input1 = Input(shape=(2, 3)) 84 | mu_input2 = Input(shape=(2, 4)) 85 | x = concatenate([mu_input1, mu_input2]) 86 | x = Flatten()(x) 87 | x = Dense(nb_actions)(x) 88 | mu_model = Model(input=[mu_input1, mu_input2], output=x) 89 | 90 | L_input1 = Input(shape=(2, 3)) 91 | L_input2 = Input(shape=(2, 4)) 92 | L_input_action = Input(shape=(nb_actions,)) 93 | x = concatenate([L_input1, L_input2]) 94 | x = concatenate([Flatten()(x), L_input_action]) 95 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 96 | L_model = Model(input=[L_input_action, L_input1, L_input2], output=x) 97 | 98 | memory = SequentialMemory(limit=10, window_length=2) 99 | processor = MultiInputProcessor(nb_inputs=2) 100 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 101 | memory=memory, nb_steps_warmup=5, batch_size=4, processor=processor) 102 | agent.compile('sgd') 103 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10) 104 | 105 | 106 | def test_naf_layer_full(): 107 | batch_size = 2 108 | for nb_actions in (1, 3): 109 | # Construct single model with NAF as the only layer, hence it is fully deterministic 110 | # since no weights are used, which would be randomly initialized. 111 | L_flat_input = Input(shape=((nb_actions * nb_actions + nb_actions) // 2,)) 112 | mu_input = Input(shape=(nb_actions,)) 113 | action_input = Input(shape=(nb_actions,)) 114 | x = NAFLayer(nb_actions, mode='full')([L_flat_input, mu_input, action_input]) 115 | model = Model(input=[L_flat_input, mu_input, action_input], output=x) 116 | model.compile(loss='mse', optimizer='sgd') 117 | 118 | # Create random test data. 119 | L_flat = np.random.random((batch_size, (nb_actions * nb_actions + nb_actions) // 2)).astype('float32') 120 | mu = np.random.random((batch_size, nb_actions)).astype('float32') 121 | action = np.random.random((batch_size, nb_actions)).astype('float32') 122 | 123 | # Perform reference computations in numpy since these are much easier to verify. 124 | L = np.zeros((batch_size, nb_actions, nb_actions)).astype('float32') 125 | LT = np.copy(L) 126 | for l, l_T, l_flat in zip(L, LT, L_flat): 127 | l[np.tril_indices(nb_actions)] = l_flat 128 | l[np.diag_indices(nb_actions)] = np.exp(l[np.diag_indices(nb_actions)]) 129 | l_T[:, :] = l.T 130 | P = np.array([np.dot(l, l_T) for l, l_T in zip(L, LT)]).astype('float32') 131 | A_ref = np.array([np.dot(np.dot(a - m, p), a - m) for a, m, p in zip(action, mu, P)]).astype('float32') 132 | A_ref *= -.5 133 | 134 | # Finally, compute the output of the net, which should be identical to the previously 135 | # computed reference. 
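# For reference, the NAF advantage term being checked here is
# A(s, a) = -0.5 * (a - mu)^T P (a - mu) with P = L L^T, where L is lower triangular
# with exponentiated diagonal entries; this is exactly what the numpy reference above builds.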
136 | A_net = model.predict([L_flat, mu, action]).flatten() 137 | assert_allclose(A_net, A_ref, rtol=1e-5) 138 | 139 | 140 | def test_naf_layer_diag(): 141 | batch_size = 2 142 | for nb_actions in (1, 3): 143 | # Construct single model with NAF as the only layer, hence it is fully deterministic 144 | # since no weights are used, which would be randomly initialized. 145 | L_flat_input = Input(shape=(nb_actions,)) 146 | mu_input = Input(shape=(nb_actions,)) 147 | action_input = Input(shape=(nb_actions,)) 148 | x = NAFLayer(nb_actions, mode='diag')([L_flat_input, mu_input, action_input]) 149 | model = Model(input=[L_flat_input, mu_input, action_input], output=x) 150 | model.compile(loss='mse', optimizer='sgd') 151 | 152 | # Create random test data. 153 | L_flat = np.random.random((batch_size, nb_actions)).astype('float32') 154 | mu = np.random.random((batch_size, nb_actions)).astype('float32') 155 | action = np.random.random((batch_size, nb_actions)).astype('float32') 156 | 157 | # Perform reference computations in numpy since these are much easier to verify. 158 | P = np.zeros((batch_size, nb_actions, nb_actions)).astype('float32') 159 | for p, l_flat in zip(P, L_flat): 160 | p[np.diag_indices(nb_actions)] = l_flat 161 | print(P, L_flat) 162 | A_ref = np.array([np.dot(np.dot(a - m, p), a - m) for a, m, p in zip(action, mu, P)]).astype('float32') 163 | A_ref *= -.5 164 | 165 | # Finally, compute the output of the net, which should be identical to the previously 166 | # computed reference. 167 | A_net = model.predict([L_flat, mu, action]).flatten() 168 | assert_allclose(A_net, A_ref, rtol=1e-5) 169 | 170 | 171 | if __name__ == '__main__': 172 | pytest.main([__file__]) 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning for Keras 2 | [![Build Status](https://api.travis-ci.org/matthiasplappert/keras-rl.svg?branch=master)](https://travis-ci.org/matthiasplappert/keras-rl) 3 | [![Documentation](https://readthedocs.org/projects/keras-rl/badge/)](http://keras-rl.readthedocs.io/) 4 | [![License](https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000)](https://github.com/matthiasplappert/keras-rl/blob/master/LICENSE) 5 | [![Join the chat at https://gitter.im/keras-rl/Lobby](https://badges.gitter.im/keras-rl/Lobby.svg)](https://gitter.im/keras-rl/Lobby) 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 16 | ## What is it? 17 | `keras-rl` implements some state-of-the art deep reinforcement learning algorithms in Python and seamlessly integrates with the deep learning library [Keras](http://keras.io). Just like Keras, it works with either [Theano](http://deeplearning.net/software/theano/) or [TensorFlow](https://www.tensorflow.org/), which means that you can train your algorithm efficiently either on CPU or GPU. 18 | Furthermore, `keras-rl` works with [OpenAI Gym](https://gym.openai.com/) out of the box. This means that evaluating and playing around with different algorithms is easy. 19 | Of course you can extend `keras-rl` according to your own needs. You can use built-in Keras callbacks and metrics or define your own. 20 | Even more so, it is easy to implement your own environments and even algorithms by simply extending some simple abstract classes. 21 | 22 | In a nutshell: `keras-rl` makes it really easy to run state-of-the-art deep reinforcement learning algorithms, uses Keras and thus Theano or TensorFlow and was built with OpenAI Gym in mind. 23 | 24 | ## What is included? 25 | As of today, the following algorithms have been implemented: 26 | 27 | - Deep Q Learning (DQN) [[1]](http://arxiv.org/abs/1312.5602), [[2]](http://home.uchicago.edu/~arij/journalclub/papers/2015_Mnih_et_al.pdf) 28 | - Double DQN [[3]](http://arxiv.org/abs/1509.06461) 29 | - Deep Deterministic Policy Gradient (DDPG) [[4]](http://arxiv.org/abs/1509.02971) 30 | - Continuous DQN (CDQN or NAF) [[6]](http://arxiv.org/abs/1603.00748) 31 | - Cross-Entropy Method (CEM) [[7]](http://learning.mpi-sws.org/mlss2016/slides/2016-MLSS-RL.pdf), [[8]](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.81.6579&rep=rep1&type=pdf) 32 | - Dueling network DQN (Dueling DQN) [[9]](https://arxiv.org/abs/1511.06581) 33 | - Deep SARSA [[10]](http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf) 34 | 35 | You can find more information on each agent in the [wiki](https://github.com/matthiasplappert/keras-rl/wiki/Agent-Overview). 36 | 37 | I'm currently working on the following algorithms, which can be found on the `experimental` branch: 38 | 39 | - Asynchronous Advantage Actor-Critic (A3C) [[5]](http://arxiv.org/abs/1602.01783) 40 | 41 | Notice that these are **only experimental** and might currently not even run. 42 | 43 | ## How do I install it and how do I get started? 44 | Installing `keras-rl` is easy. Just run the following commands and you should be good to go: 45 | ```bash 46 | pip install keras-rl 47 | ``` 48 | This will install `keras-rl` and all necessary dependencies. 49 | 50 | If you want to run the examples, you'll also have to install `gym` by OpenAI. 51 | Please refer to [their installation instructions](https://github.com/openai/gym#installation). 52 | It's quite easy and works nicely on Ubuntu and Mac OS X. 53 | You'll also need the `h5py` package to load and save model weights, which can be installed using 54 | the following command: 55 | ```bash 56 | pip install h5py 57 | ``` 58 | 59 | Once you have installed everything, you can try out a simple example: 60 | ```bash 61 | python examples/dqn_cartpole.py 62 | ``` 63 | This is a very simple example and it should converge relatively quickly, so it's a great way to get started! 64 | It also visualizes the game during training, so you can watch it learn. How cool is that? 65 | 66 | Unfortunately, the documentation of `keras-rl` is currently almost non-existent. 
67 | However, you can find a couple of more examples that illustrate the usage of both DQN (for tasks with discrete actions) as well as for DDPG (for tasks with continuous actions). 68 | While these examples are not replacement for a proper documentation, they should be enough to get started quickly and to see the magic of reinforcement learning yourself. 69 | I also encourage you to play around with other environments (OpenAI Gym has plenty) and maybe even try to find better hyperparameters for the existing ones. 70 | 71 | If you have questions or problems, please file an issue or, even better, fix the problem yourself and submit a pull request! 72 | 73 | ## Do I have to train the models myself? 74 | Training times can be very long depending on the complexity of the environment. 75 | [This repo](https://github.com/matthiasplappert/keras-rl-weights) provides some weights that were obtained by running (at least some) of the examples that are included in `keras-rl`. 76 | You can load the weights using the `load_weights` method on the respective agents. 77 | 78 | ## Requirements 79 | - Python 2.7 or Python 3.5 80 | - [Keras](http://keras.io) >= 1.0.7 81 | 82 | That's it. However, if you want to run the examples, you'll also need the following dependencies: 83 | - [OpenAI Gym](https://github.com/openai/gym) 84 | - [h5py](https://pypi.python.org/pypi/h5py) 85 | 86 | `keras-rl` also works with [TensorFlow](https://www.tensorflow.org/). To find out how to use TensorFlow instead of [Theano](http://deeplearning.net/software/theano/), please refer to the [Keras documentation](http://keras.io/#switching-from-theano-to-tensorflow). 87 | 88 | ## Documentation 89 | We are currently in the process of getting a proper documentation going. [The latest version of the 90 | documentation is available online](http://keras-rl.readthedocs.org). All contributions to the 91 | documentation are greatly appreciated! 92 | 93 | ## Support 94 | You can ask questions and join the development discussion: 95 | 96 | - On the [Keras-RL Google group](https://groups.google.com/forum/#!forum/keras-rl-users). 97 | - On the [Keras-RL Gitter channel](https://gitter.im/keras-rl/Lobby). 98 | 99 | You can also post **bug reports and feature requests** (only!) in [Github issues](https://github.com/matthiasplappert/keras-rl/issues). 100 | 101 | ## Running the Tests 102 | To run the tests locally, you'll first have to install the following dependencies: 103 | ```bash 104 | pip install pytest pytest-xdist pep8 pytest-pep8 pytest-cov python-coveralls 105 | ``` 106 | You can then run all tests using this command: 107 | ```bash 108 | py.test tests/. 109 | ``` 110 | If you want to check if the files conform to the PEP8 style guidelines, run the following command: 111 | ```bash 112 | py.test --pep8 113 | ``` 114 | 115 | ## Citing 116 | If you use `keras-rl` in your research, you can cite it as follows: 117 | ```bibtex 118 | @misc{plappert2016kerasrl, 119 | author = {Matthias Plappert}, 120 | title = {keras-rl}, 121 | year = {2016}, 122 | publisher = {GitHub}, 123 | journal = {GitHub repository}, 124 | howpublished = {\url{https://github.com/matthiasplappert/keras-rl}}, 125 | } 126 | ``` 127 | 128 | 129 | ## Acknowledgments 130 | The foundation for this library was developed during my work at the [High Performance Humanoid Technologies (H²T)](https://h2t.anthropomatik.kit.edu/) lab at the [Karlsruhe Institute of Technology (KIT)](https://kit.edu). 131 | It has since been adapted to become a general-purpose library. 
132 | 
133 | ## References
134 | 1. *Playing Atari with Deep Reinforcement Learning*, Mnih et al., 2013
135 | 2. *Human-level control through deep reinforcement learning*, Mnih et al., 2015
136 | 3. *Deep Reinforcement Learning with Double Q-learning*, van Hasselt et al., 2015
137 | 4. *Continuous control with deep reinforcement learning*, Lillicrap et al., 2015
138 | 5. *Asynchronous Methods for Deep Reinforcement Learning*, Mnih et al., 2016
139 | 6. *Continuous Deep Q-Learning with Model-based Acceleration*, Gu et al., 2016
140 | 7. *Learning Tetris Using the Noisy Cross-Entropy Method*, Szita et al., 2006
141 | 8. *Deep Reinforcement Learning (MLSS lecture notes)*, Schulman, 2016
142 | 9. *Dueling Network Architectures for Deep Reinforcement Learning*, Wang et al., 2016
143 | 10. *Reinforcement learning: An introduction*, Sutton and Barto, 2011
144 | 
145 | ## Todos
146 | - Documentation: Work on the documentation has begun but not everything is documented in code yet. Additionally, it would be super nice to have guides for each agent that describe the basic ideas behind it.
147 | - TRPO, priority-based memory, A3C, async DQN, ...
148 | 
--------------------------------------------------------------------------------
/docs/autogen.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """ This code and the entire documentation setup was adopted from the Keras repository:
3 | https://github.com/fchollet/keras/blob/master/docs/autogen.py
4 | """
5 | from __future__ import print_function
6 | from __future__ import unicode_literals
7 | 
8 | import re
9 | import inspect
10 | import os
11 | import shutil
12 | import sys
13 | if sys.version[0] == '2':
14 |     reload(sys)
15 |     sys.setdefaultencoding('utf8')
16 | 
17 | import rl
18 | import rl.core
19 | import rl.processors
20 | import rl.agents
21 | 
22 | 
23 | EXCLUDE = {
24 | 
25 | }
26 | 
27 | PAGES = [
28 |     {
29 |         'page': 'core.md',
30 |         'all_module_classes': [rl.core],
31 |     },
32 |     {
33 |         'page': 'processors.md',
34 |         'all_module_classes': [rl.processors],
35 |     },
36 |     {
37 |         'page': 'agents/overview.md',
38 |         'functions': [
39 |             rl.core.Agent.fit,
40 |             rl.core.Agent.test,
41 |             rl.core.Agent.compile,
42 |             rl.core.Agent.get_config,
43 |             rl.core.Agent.reset_states,
44 |             rl.core.Agent.load_weights,
45 |             rl.core.Agent.save_weights,
46 |         ],
47 |     },
48 |     {
49 |         'page': 'agents/dqn.md',
50 |         'classes': [rl.agents.DQNAgent],
51 |     },
52 |     {
53 |         'page': 'agents/naf.md',
54 |         'classes': [rl.agents.NAFAgent],
55 |     },
56 |     {
57 |         'page': 'agents/ddpg.md',
58 |         'classes': [rl.agents.DDPGAgent],
59 |     },
60 |     {
61 |         'page': 'agents/sarsa.md',
62 |         'classes': [rl.agents.SARSAAgent],
63 |     },
64 |     {
65 |         'page': 'agents/cem.md',
66 |         'classes': [rl.agents.CEMAgent],
67 |     },
68 | ]
69 | 
70 | 
71 | ROOT_MODULE_NAME = 'rl.'
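# Illustrative example (hypothetical, not part of the original script): the `PAGES` list above
# drives the generator -- each entry maps one documentation page to the classes or functions
# whose docstrings should be rendered into it. An additional entry for the memory module could
# look like this (and would also require an `import rl.memory` at the top of this file):
#
#     {
#         'page': 'memory.md',
#         'all_module_classes': [rl.memory],
#     },
#
# The loop further below then fills `sources/memory.md` from a `templates/memory.md` containing
# an `{{autogenerated}}` tag if such a template exists, or creates the page from scratch.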
72 | 73 | 74 | def get_earliest_class_that_defined_member(member, cls): 75 | ancestors = get_classes_ancestors([cls]) 76 | result = None 77 | for ancestor in ancestors: 78 | if member in dir(ancestor): 79 | result = ancestor 80 | if not result: 81 | return cls 82 | return result 83 | 84 | 85 | def get_classes_ancestors(classes): 86 | ancestors = [] 87 | for cls in classes: 88 | ancestors += cls.__bases__ 89 | filtered_ancestors = [] 90 | for ancestor in ancestors: 91 | if ancestor.__name__ in ['object']: 92 | continue 93 | filtered_ancestors.append(ancestor) 94 | if filtered_ancestors: 95 | return filtered_ancestors + get_classes_ancestors(filtered_ancestors) 96 | else: 97 | return filtered_ancestors 98 | 99 | 100 | def get_function_signature(function, method=True): 101 | signature = getattr(function, '_legacy_support_signature', None) 102 | if signature is None: 103 | signature = inspect.getargspec(function) 104 | defaults = signature.defaults 105 | if method: 106 | args = signature.args[1:] 107 | else: 108 | args = signature.args 109 | if defaults: 110 | kwargs = zip(args[-len(defaults):], defaults) 111 | args = args[:-len(defaults)] 112 | else: 113 | kwargs = [] 114 | st = '%s.%s(' % (function.__module__, function.__name__) 115 | for a in args: 116 | st += str(a) + ', ' 117 | for a, v in kwargs: 118 | if isinstance(v, str): 119 | v = '\'' + v + '\'' 120 | st += str(a) + '=' + str(v) + ', ' 121 | if kwargs or args: 122 | return st[:-2] + ')' 123 | else: 124 | return st + ')' 125 | 126 | 127 | def get_class_signature(cls): 128 | try: 129 | class_signature = get_function_signature(cls.__init__) 130 | class_signature = class_signature.replace('__init__', cls.__name__) 131 | except: 132 | # in case the class inherits from object and does not 133 | # define __init__ 134 | class_signature = cls.__module__ + '.' 
+ cls.__name__ + '()' 135 | return class_signature 136 | 137 | 138 | def class_to_source_link(cls): 139 | module_name = cls.__module__ 140 | assert module_name.startswith(ROOT_MODULE_NAME) 141 | path = module_name.replace('.', '/') 142 | path += '.py' 143 | line = inspect.getsourcelines(cls)[-1] 144 | link = 'https://github.com/matthiasplappert/keras-rl/blob/master/' + path + '#L' + str(line) 145 | return '[[source]](' + link + ')' 146 | 147 | 148 | def function_to_source_link(fn): 149 | module_name = fn.__module__ 150 | assert module_name.startswith(ROOT_MODULE_NAME) 151 | path = module_name.replace('.', '/') 152 | path += '.py' 153 | line = inspect.getsourcelines(fn)[-1] 154 | link = 'https://github.com/matthiasplappert/keras-rl/blob/master/' + path + '#L' + str(line) 155 | return '[[source]](' + link + ')' 156 | 157 | 158 | def code_snippet(snippet): 159 | result = '```python\n' 160 | result += snippet + '\n' 161 | result += '```\n' 162 | return result 163 | 164 | 165 | def process_class_docstring(docstring): 166 | docstring = re.sub(r'\n # (.*)\n', 167 | r'\n __\1__\n\n', 168 | docstring) 169 | 170 | docstring = re.sub(r' ([^\s\\]+) \((.*)\n', 171 | r' - __\1__ (\2\n', 172 | docstring) 173 | 174 | docstring = docstring.replace(' ' * 5, '\t\t') 175 | docstring = docstring.replace(' ' * 3, '\t') 176 | docstring = docstring.replace(' ', '') 177 | return docstring 178 | 179 | 180 | def process_function_docstring(docstring): 181 | docstring = re.sub(r'\n # (.*)\n', 182 | r'\n __\1__\n\n', 183 | docstring) 184 | docstring = re.sub(r'\n # (.*)\n', 185 | r'\n __\1__\n\n', 186 | docstring) 187 | 188 | docstring = re.sub(r' ([^\s\\]+) \((.*)\n', 189 | r' - __\1__ (\2\n', 190 | docstring) 191 | 192 | docstring = docstring.replace(' ' * 6, '\t\t') 193 | docstring = docstring.replace(' ' * 4, '\t') 194 | docstring = docstring.replace(' ', '') 195 | return docstring 196 | 197 | print('Cleaning up existing sources directory.') 198 | if os.path.exists('sources'): 199 | shutil.rmtree('sources') 200 | 201 | print('Populating sources directory with templates.') 202 | for subdir, dirs, fnames in os.walk('templates'): 203 | for fname in fnames: 204 | new_subdir = subdir.replace('templates', 'sources') 205 | if not os.path.exists(new_subdir): 206 | os.makedirs(new_subdir) 207 | if fname[-3:] == '.md': 208 | fpath = os.path.join(subdir, fname) 209 | new_fpath = fpath.replace('templates', 'sources') 210 | shutil.copy(fpath, new_fpath) 211 | 212 | # Take care of index page. 
213 | readme = open('../README.md').read() 214 | index = open('templates/index.md').read() 215 | index = index.replace('{{autogenerated}}', readme[readme.find('##'):]) 216 | f = open('sources/index.md', 'w') 217 | f.write(index) 218 | f.close() 219 | 220 | print('Starting autogeneration.') 221 | for page_data in PAGES: 222 | blocks = [] 223 | classes = page_data.get('classes', []) 224 | for module in page_data.get('all_module_classes', []): 225 | module_classes = [] 226 | for name in dir(module): 227 | if name[0] == '_' or name in EXCLUDE: 228 | continue 229 | module_member = getattr(module, name) 230 | if inspect.isclass(module_member): 231 | cls = module_member 232 | if cls.__module__ == module.__name__: 233 | if cls not in module_classes: 234 | module_classes.append(cls) 235 | module_classes.sort(key=lambda x: id(x)) 236 | classes += module_classes 237 | 238 | for cls in classes: 239 | subblocks = [] 240 | signature = get_class_signature(cls) 241 | subblocks.append('' + class_to_source_link(cls) + '') 242 | subblocks.append('### ' + cls.__name__ + '\n') 243 | subblocks.append(code_snippet(signature)) 244 | docstring = cls.__doc__ 245 | if docstring: 246 | subblocks.append(process_class_docstring(docstring)) 247 | blocks.append('\n'.join(subblocks)) 248 | 249 | functions = page_data.get('functions', []) 250 | for module in page_data.get('all_module_functions', []): 251 | module_functions = [] 252 | for name in dir(module): 253 | if name[0] == '_' or name in EXCLUDE: 254 | continue 255 | module_member = getattr(module, name) 256 | if inspect.isfunction(module_member): 257 | function = module_member 258 | if module.__name__ in function.__module__: 259 | if function not in module_functions: 260 | module_functions.append(function) 261 | module_functions.sort(key=lambda x: id(x)) 262 | functions += module_functions 263 | 264 | for function in functions: 265 | subblocks = [] 266 | signature = get_function_signature(function, method=False) 267 | signature = signature.replace(function.__module__ + '.', '') 268 | subblocks.append('' + function_to_source_link(function) + '') 269 | subblocks.append('### ' + function.__name__ + '\n') 270 | subblocks.append(code_snippet(signature)) 271 | docstring = function.__doc__ 272 | if docstring: 273 | subblocks.append(process_function_docstring(docstring)) 274 | blocks.append('\n\n'.join(subblocks)) 275 | 276 | if not blocks: 277 | raise RuntimeError('Found no content for page ' + 278 | page_data['page']) 279 | 280 | mkdown = '\n----\n\n'.join(blocks) 281 | # save module page. 
282 | # Either insert content into existing page, 283 | # or create page otherwise 284 | page_name = page_data['page'] 285 | path = os.path.join('sources', page_name) 286 | if os.path.exists(path): 287 | template = open(path).read() 288 | assert '{{autogenerated}}' in template, ('Template found for ' + path + 289 | ' but missing {{autogenerated}} tag.') 290 | mkdown = template.replace('{{autogenerated}}', mkdown) 291 | print('...inserting autogenerated content into template:', path) 292 | else: 293 | print('...creating new page with autogenerated content:', path) 294 | subdir = os.path.dirname(path) 295 | if not os.path.exists(subdir): 296 | os.makedirs(subdir) 297 | open(path, 'w').write(mkdown) 298 | -------------------------------------------------------------------------------- /rl/agents/sarsa.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | import numpy as np 4 | 5 | from keras.callbacks import History 6 | from keras.layers import Input, Lambda 7 | import keras.backend as K 8 | 9 | from rl.core import Agent 10 | from rl.agents.dqn import mean_q 11 | from rl.util import huber_loss 12 | from rl.policy import EpsGreedyQPolicy, GreedyQPolicy 13 | from rl.util import get_object_config 14 | from rl.keras_future import Model 15 | 16 | 17 | class SARSAAgent(Agent): 18 | """Write me 19 | """ 20 | def __init__(self, model, nb_actions, policy=None, test_policy=None, gamma=.99, nb_steps_warmup=10, 21 | train_interval=1, delta_clip=np.inf, *args, **kwargs): 22 | super(SarsaAgent, self).__init__(*args, **kwargs) 23 | 24 | # Do not use defaults in constructor because that would mean that each instance shares the same 25 | # policy. 26 | if policy is None: 27 | policy = EpsGreedyQPolicy() 28 | if test_policy is None: 29 | test_policy = GreedyQPolicy() 30 | 31 | self.model = model 32 | self.nb_actions = nb_actions 33 | self.policy = policy 34 | self.test_policy = test_policy 35 | self.gamma = gamma 36 | self.nb_steps_warmup = nb_steps_warmup 37 | self.train_interval = train_interval 38 | 39 | self.delta_clip = delta_clip 40 | self.compiled = False 41 | self.actions = None 42 | self.observations = None 43 | self.rewards = None 44 | 45 | def compute_batch_q_values(self, state_batch): 46 | batch = self.process_state_batch(state_batch) 47 | q_values = self.model.predict_on_batch(batch) 48 | assert q_values.shape == (len(state_batch), self.nb_actions) 49 | return q_values 50 | 51 | def compute_q_values(self, state): 52 | q_values = self.compute_batch_q_values([state]).flatten() 53 | assert q_values.shape == (self.nb_actions,) 54 | return q_values 55 | 56 | def process_state_batch(self, batch): 57 | batch = np.array(batch) 58 | if self.processor is None: 59 | return batch 60 | return self.processor.process_state_batch(batch) 61 | 62 | def get_config(self): 63 | config = super(SarsaAgent, self).get_config() 64 | config['nb_actions'] = self.nb_actions 65 | config['gamma'] = self.gamma 66 | config['nb_steps_warmup'] = self.nb_steps_warmup 67 | config['train_interval'] = self.train_interval 68 | config['delta_clip'] = self.delta_clip 69 | config['model'] = get_object_config(self.model) 70 | config['policy'] = get_object_config(self.policy) 71 | config['test_policy'] = get_object_config(self.test_policy) 72 | return config 73 | 74 | def compile(self, optimizer, metrics=[]): 75 | metrics += [mean_q] # register default metrics 76 | 77 | def clipped_masked_error(args): 78 | y_true, y_pred, mask = args 79 | loss = huber_loss(y_true, y_pred, 
self.delta_clip) 80 | loss *= mask # apply element-wise mask 81 | return K.sum(loss, axis=-1) 82 | 83 | # Create trainable model. The problem is that we need to mask the output since we only 84 | # ever want to update the Q values for a certain action. The way we achieve this is by 85 | # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility 86 | # to mask out certain parameters by passing in multiple inputs to the Lambda layer. 87 | y_pred = self.model.output 88 | y_true = Input(name='y_true', shape=(self.nb_actions,)) 89 | mask = Input(name='mask', shape=(self.nb_actions,)) 90 | loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_pred, y_true, mask]) 91 | ins = [self.model.input] if type(self.model.input) is not list else self.model.input 92 | trainable_model = Model(input=ins + [y_true, mask], output=[loss_out, y_pred]) 93 | assert len(trainable_model.output_names) == 2 94 | combined_metrics = {trainable_model.output_names[1]: metrics} 95 | losses = [ 96 | lambda y_true, y_pred: y_pred, # loss is computed in Lambda layer 97 | lambda y_true, y_pred: K.zeros_like(y_pred), # we only include this for the metrics 98 | ] 99 | trainable_model.compile(optimizer=optimizer, loss=losses, metrics=combined_metrics) 100 | self.trainable_model = trainable_model 101 | 102 | self.compiled = True 103 | 104 | def load_weights(self, filepath): 105 | self.model.load_weights(filepath) 106 | 107 | def save_weights(self, filepath, overwrite=False): 108 | self.model.save_weights(filepath, overwrite=overwrite) 109 | 110 | def reset_states(self): 111 | self.actions = collections.deque(maxlen=2) 112 | self.observations = collections.deque(maxlen=2) 113 | self.rewards = collections.deque(maxlen=2) 114 | if self.compiled: 115 | self.model.reset_states() 116 | 117 | def forward(self, observation): 118 | # Select an action. 119 | q_values = self.compute_q_values([observation]) 120 | if self.training: 121 | action = self.policy.select_action(q_values=q_values) 122 | else: 123 | action = self.test_policy.select_action(q_values=q_values) 124 | if self.processor is not None: 125 | action = self.processor.process_action(action) 126 | 127 | # Book-keeping. 128 | self.observations.append(observation) 129 | self.actions.append(action) 130 | 131 | return action 132 | 133 | def backward(self, reward, terminal): 134 | metrics = [np.nan for _ in self.metrics_names] 135 | if not self.training: 136 | # We're done here. No need to update the experience memory since we only use the working 137 | # memory to obtain the state over the most recent observations. 138 | return metrics 139 | 140 | # Train the network on a single stochastic batch. 141 | if self.step > self.nb_steps_warmup and self.step % self.train_interval == 0: 142 | # Start by extracting the necessary parameters (we use a vectorized implementation). 143 | self.rewards.append(reward) 144 | if len(self.observations) < 2: 145 | return metrics # not enough data yet 146 | 147 | state0_batch = [self.observations[0]] 148 | reward_batch = [self.rewards[0]] 149 | action_batch = [self.actions[0]] 150 | terminal1_batch = [0.] if terminal else [1.] 151 | state1_batch = [self.observations[1]] 152 | action1_batch = [self.actions[1]] 153 | 154 | # Prepare and validate parameters. 
155 | state0_batch = self.process_state_batch(state0_batch) 156 | state1_batch = self.process_state_batch(state1_batch) 157 | terminal1_batch = np.array(terminal1_batch) 158 | reward_batch = np.array(reward_batch) 159 | assert reward_batch.shape == (1,) 160 | assert terminal1_batch.shape == reward_batch.shape 161 | assert len(action_batch) == len(reward_batch) 162 | 163 | batch = self.process_state_batch(state1_batch) 164 | q_values = self.compute_q_values(batch) 165 | q_values = q_values.reshape((1, self.nb_actions)) 166 | 167 | q_batch = q_values[0, action1_batch] 168 | 169 | assert q_batch.shape == (1,) 170 | targets = np.zeros((1, self.nb_actions)) 171 | dummy_targets = np.zeros((1,)) 172 | masks = np.zeros((1, self.nb_actions)) 173 | 174 | # Compute r_t + gamma * Q(s_t+1, a_t+1) 175 | discounted_reward_batch = self.gamma * q_batch 176 | # Set discounted reward to zero for all states that were terminal. 177 | discounted_reward_batch *= terminal1_batch 178 | assert discounted_reward_batch.shape == reward_batch.shape 179 | Rs = reward_batch + discounted_reward_batch 180 | for idx, (target, mask, R, action) in enumerate(zip(targets, masks, Rs, action_batch)): 181 | target[action] = R # update action with estimated accumulated reward 182 | dummy_targets[idx] = R 183 | mask[action] = 1. # enable loss for this specific action 184 | targets = np.array(targets).astype('float32') 185 | masks = np.array(masks).astype('float32') 186 | 187 | # Finally, perform a single update on the entire batch. We use a dummy target since 188 | # the actual loss is computed in a Lambda layer that needs more complex input. However, 189 | # it is still useful to know the actual target to compute metrics properly. 190 | state0_batch = state0_batch.reshape((1,) + state0_batch.shape) 191 | ins = [state0_batch] if type(self.model.input) is not list else state0_batch 192 | metrics = self.trainable_model.train_on_batch(ins + [targets, masks], [dummy_targets, targets]) 193 | metrics = [metric for idx, metric in enumerate(metrics) if idx not in (1, 2)] # throw away individual losses 194 | metrics += self.policy.metrics 195 | if self.processor is not None: 196 | metrics += self.processor.metrics 197 | return metrics 198 | 199 | @property 200 | def layers(self): 201 | return self.model.layers[:] 202 | 203 | @property 204 | def metrics_names(self): 205 | # Throw away individual losses and replace output name since this is hidden from the user. 
206 | assert len(self.trainable_model.output_names) == 2 207 | dummy_output_name = self.trainable_model.output_names[1] 208 | model_metrics = [name for idx, name in enumerate(self.trainable_model.metrics_names) if idx not in (1, 2)] 209 | model_metrics = [name.replace(dummy_output_name + '_', '') for name in model_metrics] 210 | 211 | names = model_metrics + self.policy.metrics_names[:] 212 | if self.processor is not None: 213 | names += self.processor.metrics_names[:] 214 | return names 215 | 216 | @property 217 | def policy(self): 218 | return self.__policy 219 | 220 | @policy.setter 221 | def policy(self, policy): 222 | self.__policy = policy 223 | self.__policy._set_agent(self) 224 | 225 | @property 226 | def test_policy(self): 227 | return self.__test_policy 228 | 229 | @test_policy.setter 230 | def test_policy(self, policy): 231 | self.__test_policy = policy 232 | self.__test_policy._set_agent(self) 233 | 234 | # Aliases 235 | SarsaAgent = SARSAAgent 236 | -------------------------------------------------------------------------------- /rl/memory.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from collections import deque, namedtuple 3 | import warnings 4 | import random 5 | 6 | import numpy as np 7 | 8 | 9 | # This is to be understood as a transition: Given `state0`, performing `action` 10 | # yields `reward` and results in `state1`, which might be `terminal`. 11 | Experience = namedtuple('Experience', 'state0, action, reward, state1, terminal1') 12 | 13 | 14 | def sample_batch_indexes(low, high, size): 15 | if high - low >= size: 16 | # We have enough data. Draw without replacement, that is each index is unique in the 17 | # batch. We cannot use `np.random.choice` here because it is horribly inefficient as 18 | # the memory grows. See https://github.com/numpy/numpy/issues/2764 for a discussion. 19 | # `random.sample` does the same thing (drawing without replacement) and is way faster. 20 | try: 21 | r = xrange(low, high) 22 | except NameError: 23 | r = range(low, high) 24 | batch_idxs = random.sample(r, size) 25 | else: 26 | # Not enough data. Help ourselves with sampling from the range, but the same index 27 | # can occur multiple times. This is not good and should be avoided by picking a 28 | # large enough warm-up phase. 29 | warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!') 30 | batch_idxs = np.random.random_integers(low, high - 1, size=size) 31 | assert len(batch_idxs) == size 32 | return batch_idxs 33 | 34 | 35 | class RingBuffer(object): 36 | def __init__(self, maxlen): 37 | self.maxlen = maxlen 38 | self.start = 0 39 | self.length = 0 40 | self.data = [None for _ in range(maxlen)] 41 | 42 | def __len__(self): 43 | return self.length 44 | 45 | def __getitem__(self, idx): 46 | if idx < 0 or idx >= self.length: 47 | raise KeyError() 48 | return self.data[(self.start + idx) % self.maxlen] 49 | 50 | def append(self, v): 51 | if self.length < self.maxlen: 52 | # We have space, simply increase the length. 53 | self.length += 1 54 | elif self.length == self.maxlen: 55 | # No space, "remove" the first item. 56 | self.start = (self.start + 1) % self.maxlen 57 | else: 58 | # This should never happen. 
59 | raise RuntimeError() 60 | self.data[(self.start + self.length - 1) % self.maxlen] = v 61 | 62 | 63 | def zeroed_observation(observation): 64 | if hasattr(observation, 'shape'): 65 | return np.zeros(observation.shape) 66 | elif hasattr(observation, '__iter__'): 67 | out = [] 68 | for x in observation: 69 | out.append(zeroed_observation(x)) 70 | return out 71 | else: 72 | return 0. 73 | 74 | 75 | class Memory(object): 76 | def __init__(self, window_length, ignore_episode_boundaries=False): 77 | self.window_length = window_length 78 | self.ignore_episode_boundaries = ignore_episode_boundaries 79 | 80 | self.recent_observations = deque(maxlen=window_length) 81 | self.recent_terminals = deque(maxlen=window_length) 82 | 83 | def sample(self, batch_size, batch_idxs=None): 84 | raise NotImplementedError() 85 | 86 | def append(self, observation, action, reward, terminal, training=True): 87 | self.recent_observations.append(observation) 88 | self.recent_terminals.append(terminal) 89 | 90 | def get_recent_state(self, current_observation): 91 | # This code is slightly complicated by the fact that subsequent observations might be 92 | # from different episodes. We ensure that an experience never spans multiple episodes. 93 | # This is probably not that important in practice but it seems cleaner. 94 | state = [current_observation] 95 | idx = len(self.recent_observations) - 1 96 | for offset in range(0, self.window_length - 1): 97 | current_idx = idx - offset 98 | current_terminal = self.recent_terminals[current_idx - 1] if current_idx - 1 >= 0 else False 99 | if current_idx < 0 or (not self.ignore_episode_boundaries and current_terminal): 100 | # The previously handled observation was terminal, don't add the current one. 101 | # Otherwise we would leak into a different episode. 102 | break 103 | state.insert(0, self.recent_observations[current_idx]) 104 | while len(state) < self.window_length: 105 | state.insert(0, zeroed_observation(state[0])) 106 | return state 107 | 108 | def get_config(self): 109 | config = { 110 | 'window_length': self.window_length, 111 | 'ignore_episode_boundaries': self.ignore_episode_boundaries, 112 | } 113 | return config 114 | 115 | class SequentialMemory(Memory): 116 | def __init__(self, limit, **kwargs): 117 | super(SequentialMemory, self).__init__(**kwargs) 118 | 119 | self.limit = limit 120 | 121 | # Do not use deque to implement the memory. This data structure may seem convenient but 122 | # it is way too slow on random access. Instead, we use our own ring buffer implementation. 123 | self.actions = RingBuffer(limit) 124 | self.rewards = RingBuffer(limit) 125 | self.terminals = RingBuffer(limit) 126 | self.observations = RingBuffer(limit) 127 | 128 | def sample(self, batch_size, batch_idxs=None): 129 | if batch_idxs is None: 130 | # Draw random indexes such that we have at least a single entry before each 131 | # index. 132 | batch_idxs = sample_batch_indexes(0, self.nb_entries - 1, size=batch_size) 133 | batch_idxs = np.array(batch_idxs) + 1 134 | assert np.min(batch_idxs) >= 1 135 | assert np.max(batch_idxs) < self.nb_entries 136 | assert len(batch_idxs) == batch_size 137 | 138 | # Create experiences 139 | experiences = [] 140 | for idx in batch_idxs: 141 | terminal0 = self.terminals[idx - 2] if idx >= 2 else False 142 | while terminal0: 143 | # Skip this transition because the environment was reset here. Select a new, random 144 | # transition and use this instead. This may cause the batch to contain the same 145 | # transition twice. 
146 |                 idx = sample_batch_indexes(1, self.nb_entries, size=1)[0]
147 |                 terminal0 = self.terminals[idx - 2] if idx >= 2 else False
148 |             assert 1 <= idx < self.nb_entries
149 | 
150 |             # This code is slightly complicated by the fact that subsequent observations might be
151 |             # from different episodes. We ensure that an experience never spans multiple episodes.
152 |             # This is probably not that important in practice but it seems cleaner.
153 |             state0 = [self.observations[idx - 1]]
154 |             for offset in range(0, self.window_length - 1):
155 |                 current_idx = idx - 2 - offset
156 |                 current_terminal = self.terminals[current_idx - 1] if current_idx - 1 > 0 else False
157 |                 if current_idx < 0 or (not self.ignore_episode_boundaries and current_terminal):
158 |                     # The previously handled observation was terminal, don't add the current one.
159 |                     # Otherwise we would leak into a different episode.
160 |                     break
161 |                 state0.insert(0, self.observations[current_idx])
162 |             while len(state0) < self.window_length:
163 |                 state0.insert(0, zeroed_observation(state0[0]))
164 |             action = self.actions[idx - 1]
165 |             reward = self.rewards[idx - 1]
166 |             terminal1 = self.terminals[idx - 1]
167 | 
168 |             # Okay, now we need to create the follow-up state. This is state0 shifted one timestep
169 |             # to the right. Again, we need to be careful to not include an observation from the next
170 |             # episode if the last state is terminal.
171 |             state1 = [np.copy(x) for x in state0[1:]]
172 |             state1.append(self.observations[idx])
173 | 
174 |             assert len(state0) == self.window_length
175 |             assert len(state1) == len(state0)
176 |             experiences.append(Experience(state0=state0, action=action, reward=reward,
177 |                                           state1=state1, terminal1=terminal1))
178 |         assert len(experiences) == batch_size
179 |         return experiences
180 | 
181 |     def append(self, observation, action, reward, terminal, training=True):
182 |         super(SequentialMemory, self).append(observation, action, reward, terminal, training=training)
183 | 
184 |         # This needs to be understood as follows: in `observation`, take `action`, obtain `reward`
185 |         # and whether the next state is `terminal` or not.
186 |         if training:
187 |             self.observations.append(observation)
188 |             self.actions.append(action)
189 |             self.rewards.append(reward)
190 |             self.terminals.append(terminal)
191 | 
192 |     @property
193 |     def nb_entries(self):
194 |         return len(self.observations)
195 | 
196 |     def get_config(self):
197 |         config = super(SequentialMemory, self).get_config()
198 |         config['limit'] = self.limit
199 |         return config
200 | 
201 | 
202 | class EpisodeParameterMemory(Memory):
203 |     def __init__(self, limit, **kwargs):
204 |         super(EpisodeParameterMemory, self).__init__(**kwargs)
205 |         self.limit = limit
206 | 
207 |         self.params = RingBuffer(limit)
208 |         self.intermediate_rewards = []
209 |         self.total_rewards = RingBuffer(limit)
210 | 
211 |     def sample(self, batch_size, batch_idxs=None):
212 |         if batch_idxs is None:
213 |             batch_idxs = sample_batch_indexes(0, self.nb_entries, size=batch_size)
214 |         assert len(batch_idxs) == batch_size
215 | 
216 |         batch_params = []
217 |         batch_total_rewards = []
218 |         for idx in batch_idxs:
219 |             batch_params.append(self.params[idx])
220 |             batch_total_rewards.append(self.total_rewards[idx])
221 |         return batch_params, batch_total_rewards
222 | 
223 |     def append(self, observation, action, reward, terminal, training=True):
224 |         super(EpisodeParameterMemory, self).append(observation, action, reward, terminal, training=training)
225 |         if training:
226 |             self.intermediate_rewards.append(reward)
227 | 
228 |     def finalize_episode(self, params):
229 |         total_reward = sum(self.intermediate_rewards)
230 |         self.total_rewards.append(total_reward)
231 |         self.params.append(params)
232 |         self.intermediate_rewards = []
233 | 
234 |     @property
235 |     def nb_entries(self):
236 |         return len(self.total_rewards)
237 | 
238 |     def get_config(self):
239 |         config = super(EpisodeParameterMemory, self).get_config()
240 |         config['limit'] = self.limit
241 |         return config
242 | 
--------------------------------------------------------------------------------
/tests/rl/test_memory.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import pytest
3 | import numpy as np
4 | from numpy.testing import assert_allclose
5 | 
6 | from rl.memory import SequentialMemory, RingBuffer
7 | 
8 | 
9 | def test_ring_buffer():
10 |     def assert_elements(b, ref):
11 |         assert len(b) == len(ref)
12 |         for idx in range(b.maxlen):
13 |             if idx >= len(ref):
14 |                 with pytest.raises(KeyError):
15 |                     b[idx]
16 |             else:
17 |                 assert b[idx] == ref[idx]
18 | 
19 |     b = RingBuffer(5)
20 | 
21 |     # Fill buffer.
22 |     assert_elements(b, [])
23 |     b.append(1)
24 |     assert_elements(b, [1])
25 |     b.append(2)
26 |     assert_elements(b, [1, 2])
27 |     b.append(3)
28 |     assert_elements(b, [1, 2, 3])
29 |     b.append(4)
30 |     assert_elements(b, [1, 2, 3, 4])
31 |     b.append(5)
32 |     assert_elements(b, [1, 2, 3, 4, 5])
33 | 
34 |     # Add a couple more items with the buffer at its limit.
35 | b.append(6) 36 | assert_elements(b, [2, 3, 4, 5, 6]) 37 | b.append(7) 38 | assert_elements(b, [3, 4, 5, 6, 7]) 39 | b.append(8) 40 | assert_elements(b, [4, 5, 6, 7, 8]) 41 | 42 | 43 | def test_get_recent_state_with_episode_boundaries(): 44 | memory = SequentialMemory(3, window_length=2, ignore_episode_boundaries=False) 45 | obs_size = (3, 4) 46 | 47 | obs0 = np.random.random(obs_size) 48 | terminal0 = False 49 | 50 | obs1 = np.random.random(obs_size) 51 | terminal1 = False 52 | 53 | obs2 = np.random.random(obs_size) 54 | terminal2 = False 55 | 56 | obs3 = np.random.random(obs_size) 57 | terminal3 = True 58 | 59 | obs4 = np.random.random(obs_size) 60 | terminal4 = False 61 | 62 | obs5 = np.random.random(obs_size) 63 | terminal5 = True 64 | 65 | obs6 = np.random.random(obs_size) 66 | terminal6 = False 67 | 68 | state = np.array(memory.get_recent_state(obs0)) 69 | assert state.shape == (2,) + obs_size 70 | assert np.allclose(state[0], 0.) 71 | assert np.all(state[1] == obs0) 72 | 73 | # memory.append takes the current observation, the reward after taking an action and if 74 | # the *new* observation is terminal, thus `obs0` and `terminal1` is correct. 75 | memory.append(obs0, 0, 0., terminal1) 76 | state = np.array(memory.get_recent_state(obs1)) 77 | assert state.shape == (2,) + obs_size 78 | assert np.all(state[0] == obs0) 79 | assert np.all(state[1] == obs1) 80 | 81 | memory.append(obs1, 0, 0., terminal2) 82 | state = np.array(memory.get_recent_state(obs2)) 83 | assert state.shape == (2,) + obs_size 84 | assert np.all(state[0] == obs1) 85 | assert np.all(state[1] == obs2) 86 | 87 | memory.append(obs2, 0, 0., terminal3) 88 | state = np.array(memory.get_recent_state(obs3)) 89 | assert state.shape == (2,) + obs_size 90 | assert np.all(state[0] == obs2) 91 | assert np.all(state[1] == obs3) 92 | 93 | memory.append(obs3, 0, 0., terminal4) 94 | state = np.array(memory.get_recent_state(obs4)) 95 | assert state.shape == (2,) + obs_size 96 | assert np.all(state[0] == np.zeros(obs_size)) 97 | assert np.all(state[1] == obs4) 98 | 99 | memory.append(obs4, 0, 0., terminal5) 100 | state = np.array(memory.get_recent_state(obs5)) 101 | assert state.shape == (2,) + obs_size 102 | assert np.all(state[0] == obs4) 103 | assert np.all(state[1] == obs5) 104 | 105 | memory.append(obs5, 0, 0., terminal6) 106 | state = np.array(memory.get_recent_state(obs6)) 107 | assert state.shape == (2,) + obs_size 108 | assert np.all(state[0] == np.zeros(obs_size)) 109 | assert np.all(state[1] == obs6) 110 | 111 | 112 | def test_training_flag(): 113 | obs_size = (3, 4) 114 | 115 | obs0 = np.random.random(obs_size) 116 | terminal0 = False 117 | 118 | obs1 = np.random.random(obs_size) 119 | terminal1 = True 120 | 121 | obs2 = np.random.random(obs_size) 122 | terminal2 = False 123 | 124 | for training in (True, False): 125 | memory = SequentialMemory(3, window_length=2) 126 | 127 | state = np.array(memory.get_recent_state(obs0)) 128 | assert state.shape == (2,) + obs_size 129 | assert np.allclose(state[0], 0.) 
130 | assert np.all(state[1] == obs0) 131 | assert memory.nb_entries == 0 132 | 133 | memory.append(obs0, 0, 0., terminal1, training=training) 134 | state = np.array(memory.get_recent_state(obs1)) 135 | assert state.shape == (2,) + obs_size 136 | assert np.all(state[0] == obs0) 137 | assert np.all(state[1] == obs1) 138 | if training: 139 | assert memory.nb_entries == 1 140 | else: 141 | assert memory.nb_entries == 0 142 | 143 | memory.append(obs1, 0, 0., terminal2, training=training) 144 | state = np.array(memory.get_recent_state(obs2)) 145 | assert state.shape == (2,) + obs_size 146 | assert np.allclose(state[0], 0.) 147 | assert np.all(state[1] == obs2) 148 | if training: 149 | assert memory.nb_entries == 2 150 | else: 151 | assert memory.nb_entries == 0 152 | 153 | 154 | def test_get_recent_state_without_episode_boundaries(): 155 | memory = SequentialMemory(3, window_length=2, ignore_episode_boundaries=True) 156 | obs_size = (3, 4) 157 | 158 | obs0 = np.random.random(obs_size) 159 | terminal0 = False 160 | 161 | obs1 = np.random.random(obs_size) 162 | terminal1 = False 163 | 164 | obs2 = np.random.random(obs_size) 165 | terminal2 = False 166 | 167 | obs3 = np.random.random(obs_size) 168 | terminal3 = True 169 | 170 | obs4 = np.random.random(obs_size) 171 | terminal4 = False 172 | 173 | obs5 = np.random.random(obs_size) 174 | terminal5 = True 175 | 176 | obs6 = np.random.random(obs_size) 177 | terminal6 = False 178 | 179 | state = np.array(memory.get_recent_state(obs0)) 180 | assert state.shape == (2,) + obs_size 181 | assert np.allclose(state[0], 0.) 182 | assert np.all(state[1] == obs0) 183 | 184 | # memory.append takes the current observation, the reward after taking an action and if 185 | # the *new* observation is terminal, thus `obs0` and `terminal1` is correct. 
186 | memory.append(obs0, 0, 0., terminal1) 187 | state = np.array(memory.get_recent_state(obs1)) 188 | assert state.shape == (2,) + obs_size 189 | assert np.all(state[0] == obs0) 190 | assert np.all(state[1] == obs1) 191 | 192 | memory.append(obs1, 0, 0., terminal2) 193 | state = np.array(memory.get_recent_state(obs2)) 194 | assert state.shape == (2,) + obs_size 195 | assert np.all(state[0] == obs1) 196 | assert np.all(state[1] == obs2) 197 | 198 | memory.append(obs2, 0, 0., terminal3) 199 | state = np.array(memory.get_recent_state(obs3)) 200 | assert state.shape == (2,) + obs_size 201 | assert np.all(state[0] == obs2) 202 | assert np.all(state[1] == obs3) 203 | 204 | memory.append(obs3, 0, 0., terminal4) 205 | state = np.array(memory.get_recent_state(obs4)) 206 | assert state.shape == (2,) + obs_size 207 | assert np.all(state[0] == obs3) 208 | assert np.all(state[1] == obs4) 209 | 210 | memory.append(obs4, 0, 0., terminal5) 211 | state = np.array(memory.get_recent_state(obs5)) 212 | assert state.shape == (2,) + obs_size 213 | assert np.all(state[0] == obs4) 214 | assert np.all(state[1] == obs5) 215 | 216 | memory.append(obs5, 0, 0., terminal6) 217 | state = np.array(memory.get_recent_state(obs6)) 218 | assert state.shape == (2,) + obs_size 219 | assert np.all(state[0] == obs5) 220 | assert np.all(state[1] == obs6) 221 | 222 | 223 | def test_sampling(): 224 | memory = SequentialMemory(100, window_length=2, ignore_episode_boundaries=False) 225 | obs_size = (3, 4) 226 | actions = range(5) 227 | 228 | obs0 = np.random.random(obs_size) 229 | terminal0 = False 230 | action0 = np.random.choice(actions) 231 | reward0 = np.random.random() 232 | 233 | obs1 = np.random.random(obs_size) 234 | terminal1 = False 235 | action1 = np.random.choice(actions) 236 | reward1 = np.random.random() 237 | 238 | obs2 = np.random.random(obs_size) 239 | terminal2 = False 240 | action2 = np.random.choice(actions) 241 | reward2 = np.random.random() 242 | 243 | obs3 = np.random.random(obs_size) 244 | terminal3 = True 245 | action3 = np.random.choice(actions) 246 | reward3 = np.random.random() 247 | 248 | obs4 = np.random.random(obs_size) 249 | terminal4 = False 250 | action4 = np.random.choice(actions) 251 | reward4 = np.random.random() 252 | 253 | obs5 = np.random.random(obs_size) 254 | terminal5 = False 255 | action5 = np.random.choice(actions) 256 | reward5 = np.random.random() 257 | 258 | obs6 = np.random.random(obs_size) 259 | terminal6 = False 260 | action6 = np.random.choice(actions) 261 | reward6 = np.random.random() 262 | 263 | # memory.append takes the current observation, the reward after taking an action and if 264 | # the *new* observation is terminal, thus `obs0` and `terminal1` is correct. 
265 | memory.append(obs0, action0, reward0, terminal1) 266 | memory.append(obs1, action1, reward1, terminal2) 267 | memory.append(obs2, action2, reward2, terminal3) 268 | memory.append(obs3, action3, reward3, terminal4) 269 | memory.append(obs4, action4, reward4, terminal5) 270 | memory.append(obs5, action5, reward5, terminal6) 271 | assert memory.nb_entries == 6 272 | 273 | experiences = memory.sample(batch_size=5, batch_idxs=[0, 1, 2, 3, 4]) 274 | assert len(experiences) == 5 275 | 276 | assert_allclose(experiences[0].state0, np.array([np.zeros(obs_size), obs0])) 277 | assert_allclose(experiences[0].state1, np.array([obs0, obs1])) 278 | assert experiences[0].action == action0 279 | assert experiences[0].reward == reward0 280 | assert experiences[0].terminal1 is False 281 | 282 | assert_allclose(experiences[1].state0, np.array([obs0, obs1])) 283 | assert_allclose(experiences[1].state1, np.array([obs1, obs2])) 284 | assert experiences[1].action == action1 285 | assert experiences[1].reward == reward1 286 | assert experiences[1].terminal1 is False 287 | 288 | assert_allclose(experiences[2].state0, np.array([obs1, obs2])) 289 | assert_allclose(experiences[2].state1, np.array([obs2, obs3])) 290 | assert experiences[2].action == action2 291 | assert experiences[2].reward == reward2 292 | assert experiences[2].terminal1 is True 293 | 294 | # Next experience has been re-sampled since since state0 would be terminal in which case we 295 | # cannot really have a meaningful transition because the environment gets reset. We thus 296 | # just ensure that state0 is not terminal. 297 | assert not np.all(experiences[3].state0 == np.array([obs2, obs3])) 298 | 299 | assert_allclose(experiences[4].state0, np.array([np.zeros(obs_size), obs4])) 300 | assert_allclose(experiences[4].state1, np.array([obs4, obs5])) 301 | assert experiences[4].action == action4 302 | assert experiences[4].reward == reward4 303 | assert experiences[4].terminal1 is False 304 | 305 | 306 | if __name__ == '__main__': 307 | pytest.main([__file__]) 308 | -------------------------------------------------------------------------------- /rl/callbacks.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import warnings 4 | import timeit 5 | import json 6 | from tempfile import mkdtemp 7 | 8 | import numpy as np 9 | 10 | from keras.callbacks import Callback as KerasCallback, CallbackList as KerasCallbackList 11 | from keras.utils.generic_utils import Progbar 12 | 13 | 14 | class Callback(KerasCallback): 15 | def _set_env(self, env): 16 | self.env = env 17 | 18 | def on_episode_begin(self, episode, logs={}): 19 | pass 20 | 21 | def on_episode_end(self, episode, logs={}): 22 | pass 23 | 24 | def on_step_begin(self, step, logs={}): 25 | pass 26 | 27 | def on_step_end(self, step, logs={}): 28 | pass 29 | 30 | def on_action_begin(self, action, logs={}): 31 | pass 32 | 33 | def on_action_end(self, action, logs={}): 34 | pass 35 | 36 | 37 | class CallbackList(KerasCallbackList): 38 | def _set_env(self, env): 39 | for callback in self.callbacks: 40 | if callable(getattr(callback, '_set_env', None)): 41 | callback._set_env(env) 42 | 43 | def on_episode_begin(self, episode, logs={}): 44 | for callback in self.callbacks: 45 | # Check if callback supports the more appropriate `on_episode_begin` callback. 46 | # If not, fall back to `on_epoch_begin` to be compatible with built-in Keras callbacks. 
47 | if callable(getattr(callback, 'on_episode_begin', None)): 48 | callback.on_episode_begin(episode, logs=logs) 49 | else: 50 | callback.on_epoch_begin(episode, logs=logs) 51 | 52 | def on_episode_end(self, episode, logs={}): 53 | for callback in self.callbacks: 54 | # Check if callback supports the more appropriate `on_episode_end` callback. 55 | # If not, fall back to `on_epoch_end` to be compatible with built-in Keras callbacks. 56 | if callable(getattr(callback, 'on_episode_end', None)): 57 | callback.on_episode_end(episode, logs=logs) 58 | else: 59 | callback.on_epoch_end(episode, logs=logs) 60 | 61 | def on_step_begin(self, step, logs={}): 62 | for callback in self.callbacks: 63 | # Check if callback supports the more appropriate `on_step_begin` callback. 64 | # If not, fall back to `on_batch_begin` to be compatible with built-in Keras callbacks. 65 | if callable(getattr(callback, 'on_step_begin', None)): 66 | callback.on_step_begin(step, logs=logs) 67 | else: 68 | callback.on_batch_begin(step, logs=logs) 69 | 70 | def on_step_end(self, step, logs={}): 71 | for callback in self.callbacks: 72 | # Check if callback supports the more appropriate `on_step_end` callback. 73 | # If not, fall back to `on_batch_end` to be compatible with built-in Keras callbacks. 74 | if callable(getattr(callback, 'on_step_end', None)): 75 | callback.on_step_end(step, logs=logs) 76 | else: 77 | callback.on_batch_end(step, logs=logs) 78 | 79 | def on_action_begin(self, action, logs={}): 80 | for callback in self.callbacks: 81 | if callable(getattr(callback, 'on_action_begin', None)): 82 | callback.on_action_begin(action, logs=logs) 83 | 84 | def on_action_end(self, action, logs={}): 85 | for callback in self.callbacks: 86 | if callable(getattr(callback, 'on_action_end', None)): 87 | callback.on_action_end(action, logs=logs) 88 | 89 | 90 | class TestLogger(Callback): 91 | def on_train_begin(self, logs): 92 | print('Testing for {} episodes ...'.format(self.params['nb_episodes'])) 93 | 94 | def on_episode_end(self, episode, logs): 95 | template = 'Episode {0}: reward: {1:.3f}, steps: {2}' 96 | variables = [ 97 | episode + 1, 98 | logs['episode_reward'], 99 | logs['nb_steps'], 100 | ] 101 | print(template.format(*variables)) 102 | 103 | 104 | class TrainEpisodeLogger(Callback): 105 | def __init__(self): 106 | # Some algorithms compute multiple episodes at once since they are multi-threaded. 107 | # We therefore use a dictionary that is indexed by the episode to separate episodes 108 | # from each other. 109 | self.episode_start = {} 110 | self.observations = {} 111 | self.rewards = {} 112 | self.actions = {} 113 | self.metrics = {} 114 | self.step = 0 115 | 116 | def on_train_begin(self, logs): 117 | self.train_start = timeit.default_timer() 118 | self.metrics_names = self.model.metrics_names 119 | print('Training for {} steps ...'.format(self.params['nb_steps'])) 120 | 121 | def on_train_end(self, logs): 122 | duration = timeit.default_timer() - self.train_start 123 | print('done, took {:.3f} seconds'.format(duration)) 124 | 125 | def on_episode_begin(self, episode, logs): 126 | self.episode_start[episode] = timeit.default_timer() 127 | self.observations[episode] = [] 128 | self.rewards[episode] = [] 129 | self.actions[episode] = [] 130 | self.metrics[episode] = [] 131 | 132 | def on_episode_end(self, episode, logs): 133 | duration = timeit.default_timer() - self.episode_start[episode] 134 | episode_steps = len(self.observations[episode]) 135 | 136 | # Format all metrics. 
137 | metrics = np.array(self.metrics[episode]) 138 | metrics_template = '' 139 | metrics_variables = [] 140 | with warnings.catch_warnings(): 141 | warnings.filterwarnings('error') 142 | for idx, name in enumerate(self.metrics_names): 143 | if idx > 0: 144 | metrics_template += ', ' 145 | try: 146 | value = np.nanmean(metrics[:, idx]) 147 | metrics_template += '{}: {:f}' 148 | except Warning: 149 | value = '--' 150 | metrics_template += '{}: {}' 151 | metrics_variables += [name, value] 152 | metrics_text = metrics_template.format(*metrics_variables) 153 | 154 | nb_step_digits = str(int(np.ceil(np.log10(self.params['nb_steps']))) + 1) 155 | template = '{step: ' + nb_step_digits + 'd}/{nb_steps}: episode: {episode}, duration: {duration:.3f}s, episode steps: {episode_steps}, steps per second: {sps:.0f}, episode reward: {episode_reward:.3f}, mean reward: {reward_mean:.3f} [{reward_min:.3f}, {reward_max:.3f}], mean action: {action_mean:.3f} [{action_min:.3f}, {action_max:.3f}], mean observation: {obs_mean:.3f} [{obs_min:.3f}, {obs_max:.3f}], {metrics}' 156 | variables = { 157 | 'step': self.step, 158 | 'nb_steps': self.params['nb_steps'], 159 | 'episode': episode + 1, 160 | 'duration': duration, 161 | 'episode_steps': episode_steps, 162 | 'sps': float(episode_steps) / duration, 163 | 'episode_reward': np.sum(self.rewards[episode]), 164 | 'reward_mean': np.mean(self.rewards[episode]), 165 | 'reward_min': np.min(self.rewards[episode]), 166 | 'reward_max': np.max(self.rewards[episode]), 167 | 'action_mean': np.mean(self.actions[episode]), 168 | 'action_min': np.min(self.actions[episode]), 169 | 'action_max': np.max(self.actions[episode]), 170 | 'obs_mean': np.mean(self.observations[episode]), 171 | 'obs_min': np.min(self.observations[episode]), 172 | 'obs_max': np.max(self.observations[episode]), 173 | 'metrics': metrics_text, 174 | } 175 | print(template.format(**variables)) 176 | 177 | # Free up resources. 
178 | del self.episode_start[episode] 179 | del self.observations[episode] 180 | del self.rewards[episode] 181 | del self.actions[episode] 182 | del self.metrics[episode] 183 | 184 | def on_step_end(self, step, logs): 185 | episode = logs['episode'] 186 | self.observations[episode].append(logs['observation']) 187 | self.rewards[episode].append(logs['reward']) 188 | self.actions[episode].append(logs['action']) 189 | self.metrics[episode].append(logs['metrics']) 190 | self.step += 1 191 | 192 | 193 | class TrainIntervalLogger(Callback): 194 | def __init__(self, interval=10000): 195 | self.interval = interval 196 | self.step = 0 197 | self.reset() 198 | 199 | def reset(self): 200 | self.interval_start = timeit.default_timer() 201 | self.progbar = Progbar(target=self.interval) 202 | self.metrics = [] 203 | self.infos = [] 204 | self.info_names = None 205 | self.episode_rewards = [] 206 | 207 | def on_train_begin(self, logs): 208 | self.train_start = timeit.default_timer() 209 | self.metrics_names = self.model.metrics_names 210 | print('Training for {} steps ...'.format(self.params['nb_steps'])) 211 | 212 | def on_train_end(self, logs): 213 | duration = timeit.default_timer() - self.train_start 214 | print('done, took {:.3f} seconds'.format(duration)) 215 | 216 | def on_step_begin(self, step, logs): 217 | if self.step % self.interval == 0: 218 | if len(self.episode_rewards) > 0: 219 | metrics = np.array(self.metrics) 220 | assert metrics.shape == (self.interval, len(self.metrics_names)) 221 | formatted_metrics = '' 222 | if not np.isnan(metrics).all(): # not all values are means 223 | means = np.nanmean(self.metrics, axis=0) 224 | assert means.shape == (len(self.metrics_names),) 225 | for name, mean in zip(self.metrics_names, means): 226 | formatted_metrics += ' - {}: {:.3f}'.format(name, mean) 227 | 228 | formatted_infos = '' 229 | if len(self.infos) > 0: 230 | infos = np.array(self.infos) 231 | if not np.isnan(infos).all(): # not all values are means 232 | means = np.nanmean(self.infos, axis=0) 233 | assert means.shape == (len(self.info_names),) 234 | for name, mean in zip(self.info_names, means): 235 | formatted_infos += ' - {}: {:.3f}'.format(name, mean) 236 | print('{} episodes - episode_reward: {:.3f} [{:.3f}, {:.3f}]{}{}'.format(len(self.episode_rewards), np.mean(self.episode_rewards), np.min(self.episode_rewards), np.max(self.episode_rewards), formatted_metrics, formatted_infos)) 237 | print('') 238 | self.reset() 239 | print('Interval {} ({} steps performed)'.format(self.step // self.interval + 1, self.step)) 240 | 241 | def on_step_end(self, step, logs): 242 | if self.info_names is None: 243 | self.info_names = logs['info'].keys() 244 | values = [('reward', logs['reward'])] 245 | self.progbar.update((self.step % self.interval) + 1, values=values, force=True) 246 | self.step += 1 247 | self.metrics.append(logs['metrics']) 248 | if len(self.info_names) > 0: 249 | self.infos.append([logs['info'][k] for k in self.info_names]) 250 | 251 | def on_episode_end(self, episode, logs): 252 | self.episode_rewards.append(logs['episode_reward']) 253 | 254 | 255 | class FileLogger(Callback): 256 | def __init__(self, filepath, interval=None): 257 | self.filepath = filepath 258 | self.interval = interval 259 | 260 | # Some algorithms compute multiple episodes at once since they are multi-threaded. 261 | # We therefore use a dict that maps from episode to metrics array. 
262 | self.metrics = {} 263 | self.starts = {} 264 | self.data = {} 265 | 266 | def on_train_begin(self, logs): 267 | self.metrics_names = self.model.metrics_names 268 | 269 | def on_train_end(self, logs): 270 | self.save_data() 271 | 272 | def on_episode_begin(self, episode, logs): 273 | assert episode not in self.metrics 274 | assert episode not in self.starts 275 | self.metrics[episode] = [] 276 | self.starts[episode] = timeit.default_timer() 277 | 278 | def on_episode_end(self, episode, logs): 279 | duration = timeit.default_timer() - self.starts[episode] 280 | 281 | metrics = self.metrics[episode] 282 | if np.isnan(metrics).all(): 283 | mean_metrics = np.array([np.nan for _ in self.metrics_names]) 284 | else: 285 | mean_metrics = np.nanmean(metrics, axis=0) 286 | assert len(mean_metrics) == len(self.metrics_names) 287 | 288 | data = list(zip(self.metrics_names, mean_metrics)) 289 | data += list(logs.items()) 290 | data += [('episode', episode), ('duration', duration)] 291 | for key, value in data: 292 | if key not in self.data: 293 | self.data[key] = [] 294 | self.data[key].append(value) 295 | 296 | if self.interval is not None and episode % self.interval == 0: 297 | self.save_data() 298 | 299 | # Clean up. 300 | del self.metrics[episode] 301 | del self.starts[episode] 302 | 303 | def on_step_end(self, step, logs): 304 | self.metrics[logs['episode']].append(logs['metrics']) 305 | 306 | def save_data(self): 307 | if len(self.data.keys()) == 0: 308 | return 309 | 310 | # Sort everything by episode. 311 | assert 'episode' in self.data 312 | sorted_indexes = np.argsort(self.data['episode']) 313 | sorted_data = {} 314 | for key, values in self.data.items(): 315 | assert len(self.data[key]) == len(sorted_indexes) 316 | # We convert to np.array() and then to list to convert from np datatypes to native datatypes. 317 | # This is necessary because json.dump cannot handle np.float32, for example. 318 | sorted_data[key] = np.array([self.data[key][idx] for idx in sorted_indexes]).tolist() 319 | 320 | # Overwrite already open file. We can simply seek to the beginning since the file will 321 | # grow strictly monotonously. 322 | with open(self.filepath, 'w') as f: 323 | json.dump(sorted_data, f) 324 | 325 | 326 | class Visualizer(Callback): 327 | def on_action_end(self, action, logs): 328 | self.env.render(mode='human') 329 | 330 | 331 | class ModelIntervalCheckpoint(Callback): 332 | def __init__(self, filepath, interval, verbose=0): 333 | super(ModelIntervalCheckpoint, self).__init__() 334 | self.filepath = filepath 335 | self.interval = interval 336 | self.verbose = verbose 337 | self.total_steps = 0 338 | 339 | def on_step_end(self, step, logs={}): 340 | self.total_steps += 1 341 | if self.total_steps % self.interval != 0: 342 | # Nothing to do. 
343 | return 344 | 345 | filepath = self.filepath.format(step=self.total_steps, **logs) 346 | if self.verbose > 0: 347 | print('Step {}: saving model to {}'.format(self.total_steps, filepath)) 348 | self.model.save_weights(filepath, overwrite=True) 349 | -------------------------------------------------------------------------------- /rl/agents/ddpg.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import deque 3 | import os 4 | import warnings 5 | 6 | import numpy as np 7 | import keras.backend as K 8 | import keras.optimizers as optimizers 9 | 10 | from rl.core import Agent 11 | from rl.random import OrnsteinUhlenbeckProcess 12 | from rl.util import * 13 | 14 | 15 | def mean_q(y_true, y_pred): 16 | return K.mean(K.max(y_pred, axis=-1)) 17 | 18 | 19 | # Deep DPG as described by Lillicrap et al. (2015) 20 | # http://arxiv.org/pdf/1509.02971v2.pdf 21 | # http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.646.4324&rep=rep1&type=pdf 22 | class DDPGAgent(Agent): 23 | """Write me 24 | """ 25 | def __init__(self, nb_actions, actor, critic, critic_action_input, memory, 26 | gamma=.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, 27 | train_interval=1, memory_interval=1, delta_range=None, delta_clip=np.inf, 28 | random_process=None, custom_model_objects={}, target_model_update=.001, **kwargs): 29 | if hasattr(actor.output, '__len__') and len(actor.output) > 1: 30 | raise ValueError('Actor "{}" has more than one output. DDPG expects an actor that has a single output.'.format(actor)) 31 | if hasattr(critic.output, '__len__') and len(critic.output) > 1: 32 | raise ValueError('Critic "{}" has more than one output. DDPG expects a critic that has a single output.'.format(critic)) 33 | if critic_action_input not in critic.input: 34 | raise ValueError('Critic "{}" does not have designated action input "{}".'.format(critic, critic_action_input)) 35 | if not hasattr(critic.input, '__len__') or len(critic.input) < 2: 36 | raise ValueError('Critic "{}" does not have enough inputs. The critic must have at exactly two inputs, one for the action and one for the observation.'.format(critic)) 37 | 38 | super(DDPGAgent, self).__init__(**kwargs) 39 | 40 | # Soft vs hard target model updates. 41 | if target_model_update < 0: 42 | raise ValueError('`target_model_update` must be >= 0.') 43 | elif target_model_update >= 1: 44 | # Hard update every `target_model_update` steps. 45 | target_model_update = int(target_model_update) 46 | else: 47 | # Soft update with `(1 - target_model_update) * old + target_model_update * new`. 48 | target_model_update = float(target_model_update) 49 | 50 | if delta_range is not None: 51 | warnings.warn('`delta_range` is deprecated. Please use `delta_clip` instead, which takes a single scalar. For now we\'re falling back to `delta_range[1] = {}`'.format(delta_range[1])) 52 | delta_clip = delta_range[1] 53 | 54 | # Parameters. 55 | self.nb_actions = nb_actions 56 | self.nb_steps_warmup_actor = nb_steps_warmup_actor 57 | self.nb_steps_warmup_critic = nb_steps_warmup_critic 58 | self.random_process = random_process 59 | self.delta_clip = delta_clip 60 | self.gamma = gamma 61 | self.target_model_update = target_model_update 62 | self.batch_size = batch_size 63 | self.train_interval = train_interval 64 | self.memory_interval = memory_interval 65 | self.custom_model_objects = custom_model_objects 66 | 67 | # Related objects. 
68 | self.actor = actor 69 | self.critic = critic 70 | self.critic_action_input = critic_action_input 71 | self.critic_action_input_idx = self.critic.input.index(critic_action_input) 72 | self.memory = memory 73 | 74 | # State. 75 | self.compiled = False 76 | self.reset_states() 77 | 78 | @property 79 | def uses_learning_phase(self): 80 | return self.actor.uses_learning_phase or self.critic.uses_learning_phase 81 | 82 | def compile(self, optimizer, metrics=[]): 83 | metrics += [mean_q] 84 | 85 | if type(optimizer) in (list, tuple): 86 | if len(optimizer) != 2: 87 | raise ValueError('Please provide exactly two optimizers, the first one for the actor and the second one for the critic.') 88 | actor_optimizer, critic_optimizer = optimizer 89 | else: 90 | actor_optimizer = optimizer 91 | critic_optimizer = clone_optimizer(optimizer) 92 | if type(actor_optimizer) is str: 93 | actor_optimizer = optimizers.get(actor_optimizer) 94 | if type(critic_optimizer) is str: 95 | critic_optimizer = optimizers.get(critic_optimizer) 96 | assert actor_optimizer != critic_optimizer 97 | 98 | if len(metrics) == 2 and hasattr(metrics[0], '__len__') and hasattr(metrics[1], '__len__'): 99 | actor_metrics, critic_metrics = metrics 100 | else: 101 | actor_metrics = critic_metrics = metrics 102 | 103 | def clipped_error(y_true, y_pred): 104 | return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1) 105 | 106 | # Compile target networks. We only use them in feed-forward mode, hence we can pass any 107 | # optimizer and loss since we never use it anyway. 108 | self.target_actor = clone_model(self.actor, self.custom_model_objects) 109 | self.target_actor.compile(optimizer='sgd', loss='mse') 110 | self.target_critic = clone_model(self.critic, self.custom_model_objects) 111 | self.target_critic.compile(optimizer='sgd', loss='mse') 112 | 113 | # We also compile the actor. We never optimize the actor using Keras but instead compute 114 | # the policy gradient ourselves. However, we need the actor in feed-forward mode, hence 115 | # we also compile it with an arbitrary optimizer and loss, which are never actually used. 116 | self.actor.compile(optimizer='sgd', loss='mse') 117 | 118 | # Compile the critic. 119 | if self.target_model_update < 1.: 120 | # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model. 121 | critic_updates = get_soft_target_model_updates(self.target_critic, self.critic, self.target_model_update) 122 | critic_optimizer = AdditionalUpdatesOptimizer(critic_optimizer, critic_updates) 123 | self.critic.compile(optimizer=critic_optimizer, loss=clipped_error, metrics=critic_metrics) 124 | 125 | # Combine actor and critic so that we can get the policy gradient. 126 | # Assuming critic's state inputs are the same as actor's. 127 | combined_inputs = [] 128 | critic_inputs = [] 129 | for i in self.critic.input: 130 | if i == self.critic_action_input: 131 | combined_inputs.append([]) 132 | else: 133 | combined_inputs.append(i) 134 | critic_inputs.append(i) 135 | combined_inputs[self.critic_action_input_idx] = self.actor(critic_inputs) 136 | 137 | combined_output = self.critic(combined_inputs) 138 | 139 | updates = actor_optimizer.get_updates(self.actor.trainable_weights, self.actor.constraints, 140 | loss=-K.mean(combined_output)) 141 | if self.target_model_update < 1.: 142 | # Include soft target model updates.
143 | updates += get_soft_target_model_updates(self.target_actor, self.actor, self.target_model_update) 144 | updates += self.actor.updates # include other updates of the actor, e.g. for BN 145 | 146 | # Finally, combine it all into a callable function. 147 | if self.uses_learning_phase: 148 | critic_inputs += [K.learning_phase()] 149 | self.actor_train_fn = K.function(critic_inputs, [self.actor(critic_inputs)], updates=updates) 150 | self.actor_optimizer = actor_optimizer 151 | 152 | self.compiled = True 153 | 154 | def load_weights(self, filepath): 155 | filename, extension = os.path.splitext(filepath) 156 | actor_filepath = filename + '_actor' + extension 157 | critic_filepath = filename + '_critic' + extension 158 | self.actor.load_weights(actor_filepath) 159 | self.critic.load_weights(critic_filepath) 160 | self.update_target_models_hard() 161 | 162 | def save_weights(self, filepath, overwrite=False): 163 | filename, extension = os.path.splitext(filepath) 164 | actor_filepath = filename + '_actor' + extension 165 | critic_filepath = filename + '_critic' + extension 166 | self.actor.save_weights(actor_filepath, overwrite=overwrite) 167 | self.critic.save_weights(critic_filepath, overwrite=overwrite) 168 | 169 | def update_target_models_hard(self): 170 | self.target_critic.set_weights(self.critic.get_weights()) 171 | self.target_actor.set_weights(self.actor.get_weights()) 172 | 173 | # TODO: implement pickle 174 | 175 | def reset_states(self): 176 | if self.random_process is not None: 177 | self.random_process.reset_states() 178 | self.recent_action = None 179 | self.recent_observation = None 180 | if self.compiled: 181 | self.actor.reset_states() 182 | self.critic.reset_states() 183 | self.target_actor.reset_states() 184 | self.target_critic.reset_states() 185 | 186 | def process_state_batch(self, batch): 187 | batch = np.array(batch) 188 | if self.processor is None: 189 | return batch 190 | return self.processor.process_state_batch(batch) 191 | 192 | def select_action(self, state): 193 | batch = self.process_state_batch([state]) 194 | action = self.actor.predict_on_batch(batch).flatten() 195 | assert action.shape == (self.nb_actions,) 196 | 197 | # Apply noise, if a random process is set. 198 | if self.training and self.random_process is not None: 199 | noise = self.random_process.sample() 200 | assert noise.shape == action.shape 201 | action += noise 202 | 203 | return action 204 | 205 | def forward(self, observation): 206 | # Select an action. 207 | state = self.memory.get_recent_state(observation) 208 | action = self.select_action(state) # TODO: move this into policy 209 | if self.processor is not None: 210 | action = self.processor.process_action(action) 211 | 212 | # Book-keeping. 213 | self.recent_observation = observation 214 | self.recent_action = action 215 | 216 | return action 217 | 218 | @property 219 | def layers(self): 220 | return self.actor.layers[:] + self.critic.layers[:] 221 | 222 | @property 223 | def metrics_names(self): 224 | names = self.critic.metrics_names[:] 225 | if self.processor is not None: 226 | names += self.processor.metrics_names[:] 227 | return names 228 | 229 | def backward(self, reward, terminal=False): 230 | # Store most recent experience in memory. 231 | if self.step % self.memory_interval == 0: 232 | self.memory.append(self.recent_observation, self.recent_action, reward, terminal, 233 | training=self.training) 234 | 235 | metrics = [np.nan for _ in self.metrics_names] 236 | if not self.training: 237 | # We're done here. 
No need to update the experience memory since we only use the working 238 | # memory to obtain the state over the most recent observations. 239 | return metrics 240 | 241 | # Train the network on a single stochastic batch. 242 | can_train_either = self.step > self.nb_steps_warmup_critic or self.step > self.nb_steps_warmup_actor 243 | if can_train_either and self.step % self.train_interval == 0: 244 | experiences = self.memory.sample(self.batch_size) 245 | assert len(experiences) == self.batch_size 246 | 247 | # Start by extracting the necessary parameters (we use a vectorized implementation). 248 | state0_batch = [] 249 | reward_batch = [] 250 | action_batch = [] 251 | terminal1_batch = [] 252 | state1_batch = [] 253 | for e in experiences: 254 | state0_batch.append(e.state0) 255 | state1_batch.append(e.state1) 256 | reward_batch.append(e.reward) 257 | action_batch.append(e.action) 258 | terminal1_batch.append(0. if e.terminal1 else 1.) 259 | 260 | # Prepare and validate parameters. 261 | state0_batch = self.process_state_batch(state0_batch) 262 | state1_batch = self.process_state_batch(state1_batch) 263 | terminal1_batch = np.array(terminal1_batch) 264 | reward_batch = np.array(reward_batch) 265 | action_batch = np.array(action_batch) 266 | assert reward_batch.shape == (self.batch_size,) 267 | assert terminal1_batch.shape == reward_batch.shape 268 | assert action_batch.shape == (self.batch_size, self.nb_actions) 269 | 270 | # Update critic, if warm up is over. 271 | if self.step > self.nb_steps_warmup_critic: 272 | target_actions = self.target_actor.predict_on_batch(state1_batch) 273 | assert target_actions.shape == (self.batch_size, self.nb_actions) 274 | if len(self.critic.inputs) >= 3: 275 | state1_batch_with_action = state1_batch[:] 276 | else: 277 | state1_batch_with_action = [state1_batch] 278 | state1_batch_with_action.insert(self.critic_action_input_idx, target_actions) 279 | target_q_values = self.target_critic.predict_on_batch(state1_batch_with_action).flatten() 280 | assert target_q_values.shape == (self.batch_size,) 281 | 282 | # Compute the critic targets r_t + gamma * Q'(s_{t+1}, mu'(s_{t+1})) using the target actor 283 | # and target critic networks; for terminal transitions the bootstrapped term is zeroed out. 284 | discounted_reward_batch = self.gamma * target_q_values 285 | discounted_reward_batch *= terminal1_batch 286 | assert discounted_reward_batch.shape == reward_batch.shape 287 | targets = (reward_batch + discounted_reward_batch).reshape(self.batch_size, 1) 288 | 289 | # Perform a single batch update on the critic network. 290 | if len(self.critic.inputs) >= 3: 291 | state0_batch_with_action = state0_batch[:] 292 | else: 293 | state0_batch_with_action = [state0_batch] 294 | state0_batch_with_action.insert(self.critic_action_input_idx, action_batch) 295 | metrics = self.critic.train_on_batch(state0_batch_with_action, targets) 296 | if self.processor is not None: 297 | metrics += self.processor.metrics 298 | 299 | # Update actor, if warm up is over.
300 | if self.step > self.nb_steps_warmup_actor: 301 | # TODO: implement metrics for actor 302 | if len(self.actor.inputs) >= 2: 303 | inputs = state0_batch[:] 304 | else: 305 | inputs = [state0_batch] 306 | if self.uses_learning_phase: 307 | inputs += [self.training] 308 | action_values = self.actor_train_fn(inputs)[0] 309 | assert action_values.shape == (self.batch_size, self.nb_actions) 310 | 311 | if self.target_model_update >= 1 and self.step % self.target_model_update == 0: 312 | self.update_target_models_hard() 313 | 314 | return metrics 315 | -------------------------------------------------------------------------------- /rl/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import warnings 3 | from copy import deepcopy 4 | 5 | import numpy as np 6 | from keras.callbacks import History 7 | 8 | from rl.callbacks import TestLogger, TrainEpisodeLogger, TrainIntervalLogger, Visualizer, CallbackList 9 | 10 | 11 | class Agent(object): 12 | """Abstract base class for all implemented agents. 13 | 14 | Each agent interacts with the environment (as defined by the `Env` class) by first observing the 15 | state of the environment. Based on this observation the agent changes the environment by performing 16 | an action. 17 | 18 | Do not use this abstract base class directly but instead use one of the concrete agents implemented. 19 | Each agent realizes a reinforcement learning algorithm. Since all agents conform to the same 20 | interface, you can use them interchangeably. 21 | 22 | To implement your own agent, you have to implement the following methods: 23 | 24 | - `forward` 25 | - `backward` 26 | - `compile` 27 | - `load_weights` 28 | - `save_weights` 29 | - `layers` 30 | 31 | # Arguments 32 | processor (`Processor` instance): See [Processor](#processor) for details. 33 | """ 34 | def __init__(self, processor=None): 35 | self.processor = processor 36 | self.training = False 37 | self.step = 0 38 | 39 | def get_config(self): 40 | """Configuration of the agent for serialization. 41 | """ 42 | return {} 43 | 44 | def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, 45 | visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, 46 | nb_max_episode_steps=None): 47 | """Trains the agent on the given environment. 48 | 49 | # Arguments 50 | env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. 51 | nb_steps (integer): Number of training steps to be performed. 52 | action_repetition (integer): Number of times the agent repeats the same action without 53 | observing the environment again. Setting this to a value > 1 can be useful 54 | if a single action only has a very small effect on the environment. 55 | callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): 56 | List of callbacks to apply during training. See [callbacks](/callbacks) for details. 57 | verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging 58 | visualize (boolean): If `True`, the environment is visualized during training. However, 59 | this is likely going to slow down training significantly and is thus intended to be 60 | a debugging instrument. 61 | nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning 62 | of each episode using `start_step_policy`. 
Notice that this is an upper limit since 63 | the exact number of steps to be performed is sampled uniformly from [0, nb_max_start_steps] 64 | at the beginning of each episode. 65 | start_step_policy (`lambda observation: action`): The policy 66 | to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. 67 | log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval. 68 | nb_max_episode_steps (integer): Number of steps per episode that the agent performs before 69 | automatically resetting the environment. Set to `None` if each episode should run 70 | (potentially indefinitely) until the environment signals a terminal state. 71 | 72 | # Returns 73 | A `keras.callbacks.History` instance that recorded the entire training process. 74 | """ 75 | if not self.compiled: 76 | raise RuntimeError('You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.') 77 | if action_repetition < 1: 78 | raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition)) 79 | 80 | self.training = True 81 | 82 | callbacks = [] if not callbacks else callbacks[:] 83 | 84 | if verbose == 1: 85 | callbacks += [TrainIntervalLogger(interval=log_interval)] 86 | elif verbose > 1: 87 | callbacks += [TrainEpisodeLogger()] 88 | if visualize: 89 | callbacks += [Visualizer()] 90 | history = History() 91 | callbacks += [history] 92 | callbacks = CallbackList(callbacks) 93 | if hasattr(callbacks, 'set_model'): 94 | callbacks.set_model(self) 95 | else: 96 | callbacks._set_model(self) 97 | callbacks._set_env(env) 98 | params = { 99 | 'nb_steps': nb_steps, 100 | } 101 | if hasattr(callbacks, 'set_params'): 102 | callbacks.set_params(params) 103 | else: 104 | callbacks._set_params(params) 105 | self._on_train_begin() 106 | callbacks.on_train_begin() 107 | 108 | episode = 0 109 | self.step = 0 110 | observation = None 111 | episode_reward = None 112 | episode_step = None 113 | did_abort = False 114 | try: 115 | while self.step < nb_steps: 116 | if observation is None: # start of a new episode 117 | callbacks.on_episode_begin(episode) 118 | episode_step = 0 119 | episode_reward = 0. 120 | 121 | # Obtain the initial observation by resetting the environment. 122 | self.reset_states() 123 | observation = deepcopy(env.reset()) 124 | if self.processor is not None: 125 | observation = self.processor.process_observation(observation) 126 | assert observation is not None 127 | 128 | # Perform random starts at beginning of episode and do not record them into the experience. 129 | # This slightly changes the start position between games. 130 | nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps) 131 | for _ in range(nb_random_start_steps): 132 | if start_step_policy is None: 133 | action = env.action_space.sample() 134 | else: 135 | action = start_step_policy(observation) 136 | if self.processor is not None: 137 | action = self.processor.process_action(action) 138 | callbacks.on_action_begin(action) 139 | observation, reward, done, info = env.step(action) 140 | observation = deepcopy(observation) 141 | if self.processor is not None: 142 | observation, reward, done, info = self.processor.process_step(observation, reward, done, info) 143 | callbacks.on_action_end(action) 144 | if done: 145 | warnings.warn('Env ended before {} random steps could be performed at the start.
You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps)) 146 | observation = deepcopy(env.reset()) 147 | if self.processor is not None: 148 | observation = self.processor.process_observation(observation) 149 | break 150 | 151 | # At this point, we expect to be fully initialized. 152 | assert episode_reward is not None 153 | assert episode_step is not None 154 | assert observation is not None 155 | 156 | # Run a single step. 157 | callbacks.on_step_begin(episode_step) 158 | # This is where all of the work happens. We first perceive and compute the action 159 | # (forward step) and then use the reward to improve (backward step). 160 | action = self.forward(observation) 161 | if self.processor is not None: 162 | action = self.processor.process_action(action) 163 | reward = 0. 164 | accumulated_info = {} 165 | done = False 166 | for _ in range(action_repetition): 167 | callbacks.on_action_begin(action) 168 | observation, r, done, info = env.step(action) 169 | observation = deepcopy(observation) 170 | if self.processor is not None: 171 | observation, r, done, info = self.processor.process_step(observation, r, done, info) 172 | for key, value in info.items(): 173 | if not np.isreal(value): 174 | continue 175 | if key not in accumulated_info: 176 | accumulated_info[key] = np.zeros_like(value) 177 | accumulated_info[key] += value 178 | callbacks.on_action_end(action) 179 | reward += r 180 | if done: 181 | break 182 | if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: 183 | # Force a terminal state. 184 | done = True 185 | metrics = self.backward(reward, terminal=done) 186 | episode_reward += reward 187 | 188 | step_logs = { 189 | 'action': action, 190 | 'observation': observation, 191 | 'reward': reward, 192 | 'metrics': metrics, 193 | 'episode': episode, 194 | 'info': accumulated_info, 195 | } 196 | callbacks.on_step_end(episode_step, step_logs) 197 | episode_step += 1 198 | self.step += 1 199 | 200 | if done: 201 | # We are in a terminal state but the agent hasn't yet seen it. We therefore 202 | # perform one more forward-backward call and simply ignore the action before 203 | # resetting the environment. We need to pass in `terminal=False` here since 204 | # the *next* state, that is the state of the newly reset environment, is 205 | # always non-terminal by convention. 206 | self.forward(observation) 207 | self.backward(0., terminal=False) 208 | 209 | # This episode is finished, report and reset. 210 | episode_logs = { 211 | 'episode_reward': episode_reward, 212 | 'nb_episode_steps': episode_step, 213 | 'nb_steps': self.step, 214 | } 215 | callbacks.on_episode_end(episode, episode_logs) 216 | 217 | episode += 1 218 | observation = None 219 | episode_step = None 220 | episode_reward = None 221 | except KeyboardInterrupt: 222 | # We catch keyboard interrupts here so that training can be safely aborted. 223 | # This is so common that we've built this right into this function, which ensures that 224 | # the `on_train_end` method is properly called. 225 | did_abort = True 226 | callbacks.on_train_end(logs={'did_abort': did_abort}) 227 | self._on_train_end() 228 | 229 | return history 230 | 231 | def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, 232 | nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1): 233 | """Tests the agent on the given environment.
234 | """ 235 | if not self.compiled: 236 | raise RuntimeError('You tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.') 237 | if action_repetition < 1: 238 | raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition)) 239 | 240 | self.training = False 241 | self.step = 0 242 | 243 | callbacks = [] if not callbacks else callbacks[:] 244 | 245 | if verbose >= 1: 246 | callbacks += [TestLogger()] 247 | if visualize: 248 | callbacks += [Visualizer()] 249 | history = History() 250 | callbacks += [history] 251 | callbacks = CallbackList(callbacks) 252 | if hasattr(callbacks, 'set_model'): 253 | callbacks.set_model(self) 254 | else: 255 | callbacks._set_model(self) 256 | callbacks._set_env(env) 257 | params = { 258 | 'nb_episodes': nb_episodes, 259 | } 260 | if hasattr(callbacks, 'set_params'): 261 | callbacks.set_params(params) 262 | else: 263 | callbacks._set_params(params) 264 | 265 | self._on_test_begin() 266 | callbacks.on_train_begin() 267 | for episode in range(nb_episodes): 268 | callbacks.on_episode_begin(episode) 269 | episode_reward = 0. 270 | episode_step = 0 271 | 272 | # Obtain the initial observation by resetting the environment. 273 | self.reset_states() 274 | observation = deepcopy(env.reset()) 275 | if self.processor is not None: 276 | observation = self.processor.process_observation(observation) 277 | assert observation is not None 278 | 279 | # Perform random starts at beginning of episode and do not record them into the experience. 280 | # This slightly changes the start position between games. 281 | nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps) 282 | for _ in range(nb_random_start_steps): 283 | if start_step_policy is None: 284 | action = env.action_space.sample() 285 | else: 286 | action = start_step_policy(observation) 287 | if self.processor is not None: 288 | action = self.processor.process_action(action) 289 | callbacks.on_action_begin(action) 290 | observation, r, done, info = env.step(action) 291 | observation = deepcopy(observation) 292 | if self.processor is not None: 293 | observation, r, done, info = self.processor.process_step(observation, r, done, info) 294 | callbacks.on_action_end(action) 295 | if done: 296 | warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps)) 297 | observation = deepcopy(env.reset()) 298 | if self.processor is not None: 299 | observation = self.processor.process_observation(observation) 300 | break 301 | 302 | # Run the episode until we're done. 303 | done = False 304 | while not done: 305 | callbacks.on_step_begin(episode_step) 306 | 307 | action = self.forward(observation) 308 | if self.processor is not None: 309 | action = self.processor.process_action(action) 310 | reward = 0.
311 | accumulated_info = {} 312 | for _ in range(action_repetition): 313 | callbacks.on_action_begin(action) 314 | observation, r, d, info = env.step(action) 315 | observation = deepcopy(observation) 316 | if self.processor is not None: 317 | observation, r, d, info = self.processor.process_step(observation, r, d, info) 318 | callbacks.on_action_end(action) 319 | reward += r 320 | for key, value in info.items(): 321 | if not np.isreal(value): 322 | continue 323 | if key not in accumulated_info: 324 | accumulated_info[key] = np.zeros_like(value) 325 | accumulated_info[key] += value 326 | if d: 327 | done = True 328 | break 329 | if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: 330 | done = True 331 | self.backward(reward, terminal=done) 332 | episode_reward += reward 333 | 334 | step_logs = { 335 | 'action': action, 336 | 'observation': observation, 337 | 'reward': reward, 338 | 'episode': episode, 339 | 'info': accumulated_info, 340 | } 341 | callbacks.on_step_end(episode_step, step_logs) 342 | episode_step += 1 343 | self.step += 1 344 | 345 | # We are in a terminal state but the agent hasn't yet seen it. We therefore 346 | # perform one more forward-backward call and simply ignore the action before 347 | # resetting the environment. We need to pass in `terminal=False` here since 348 | # the *next* state, that is the state of the newly reset environment, is 349 | # always non-terminal by convention. 350 | self.forward(observation) 351 | self.backward(0., terminal=False) 352 | 353 | # Report end of episode. 354 | episode_logs = { 355 | 'episode_reward': episode_reward, 356 | 'nb_steps': episode_step, 357 | } 358 | callbacks.on_episode_end(episode, episode_logs) 359 | callbacks.on_train_end() 360 | self._on_test_end() 361 | 362 | return history 363 | 364 | def reset_states(self): 365 | """Resets all internally kept states after an episode is completed. 366 | """ 367 | pass 368 | 369 | def forward(self, observation): 370 | """Takes an observation from the environment and returns the action to be taken next. 371 | If the policy is implemented by a neural network, this corresponds to a forward (inference) pass. 372 | 373 | # Argument 374 | observation (object): The current observation from the environment. 375 | 376 | # Returns 377 | The next action to be executed in the environment. 378 | """ 379 | raise NotImplementedError() 380 | 381 | def backward(self, reward, terminal): 382 | """Updates the agent after having executed the action returned by `forward`. 383 | If the policy is implemented by a neural network, this corresponds to a weight update using back-prop. 384 | 385 | # Argument 386 | reward (float): The observed reward after executing the action returned by `forward`. 387 | terminal (boolean): `True` if the new state of the environment is terminal. 388 | """ 389 | raise NotImplementedError() 390 | 391 | def compile(self, optimizer, metrics=[]): 392 | """Compiles an agent and the underlying models to be used for training and testing. 393 | 394 | # Arguments 395 | optimizer (`keras.optimizers.Optimizer` instance): The optimizer to be used during training. 396 | metrics (list of functions `lambda y_true, y_pred: metric`): The metrics to run during training. 397 | """ 398 | raise NotImplementedError() 399 | 400 | def load_weights(self, filepath): 401 | """Loads the weights of an agent from an HDF5 file. 402 | 403 | # Arguments 404 | filepath (str): The path to the HDF5 file.
405 | """ 406 | raise NotImplementedError() 407 | 408 | def save_weights(self, filepath, overwrite=False): 409 | """Saves the weights of an agent as an HDF5 file. 410 | 411 | # Arguments 412 | filepath (str): The path to where the weights should be saved. 413 | overwrite (boolean): If `False` and `filepath` already exists, raises an error. 414 | """ 415 | raise NotImplementedError() 416 | 417 | @property 418 | def layers(self): 419 | """Returns all layers of the underlying model(s). 420 | 421 | If the concrete implementation uses multiple internal models, 422 | this method returns them in a concatenated list. 423 | """ 424 | raise NotImplementedError() 425 | 426 | @property 427 | def metrics_names(self): 428 | """The human-readable names of the agent's metrics. Must return as many names as there 429 | are metrics (see also `compile`). 430 | """ 431 | return [] 432 | 433 | def _on_train_begin(self): 434 | """Callback that is called before training begins. 435 | """ 436 | pass 437 | 438 | def _on_train_end(self): 439 | """Callback that is called after training ends. 440 | """ 441 | pass 442 | 443 | def _on_test_begin(self): 444 | """Callback that is called before testing begins. 445 | """ 446 | pass 447 | 448 | def _on_test_end(self): 449 | """Callback that is called after testing ends. 450 | """ 451 | pass 452 | 453 | 454 | class Processor(object): 455 | """Abstract base class for implementing processors. 456 | 457 | A processor acts as a coupling mechanism between an `Agent` and its `Env`. This can 458 | be necessary if your agent has different requirements with respect to the form of the 459 | observations, actions, and rewards of the environment. By implementing a custom processor, 460 | you can effectively translate between the two without having to change the underlying 461 | implementation of the agent or environment. 462 | 463 | Do not use this abstract base class directly but instead use one of the concrete implementations 464 | or write your own. 465 | """ 466 | 467 | def process_step(self, observation, reward, done, info): 468 | """Processes an entire step by applying the processor to the observation, reward, and info arguments. 469 | 470 | # Arguments 471 | observation (object): An observation as obtained by the environment. 472 | reward (float): A reward as obtained by the environment. 473 | done (boolean): `True` if the environment is in a terminal state, `False` otherwise. 474 | info (dict): The debug info dictionary as obtained by the environment. 475 | 476 | # Returns 477 | The tuple (observation, reward, done, info) with all elements after being processed. 478 | """ 479 | observation = self.process_observation(observation) 480 | reward = self.process_reward(reward) 481 | info = self.process_info(info) 482 | return observation, reward, done, info 483 | 484 | def process_observation(self, observation): 485 | """Processes the observation as obtained from the environment for use in an agent and 486 | returns it. 487 | """ 488 | return observation 489 | 490 | def process_reward(self, reward): 491 | """Processes the reward as obtained from the environment for use in an agent and 492 | returns it. 493 | """ 494 | return reward 495 | 496 | def process_info(self, info): 497 | """Processes the info as obtained from the environment for use in an agent and 498 | returns it. 499 | """ 500 | return info 501 | 502 | def process_action(self, action): 503 | """Processes an action predicted by an agent but before execution in an environment.
504 | """ 505 | return action 506 | 507 | def process_state_batch(self, batch): 508 | """Processes an entire batch of states and returns it. 509 | """ 510 | return batch 511 | 512 | @property 513 | def metrics(self): 514 | """The metrics of the processor, which will be reported during training. 515 | 516 | # Returns 517 | List of `lambda y_true, y_pred: metric` functions. 518 | """ 519 | return [] 520 | 521 | @property 522 | def metrics_names(self): 523 | """The human-readable names of the processor's metrics. Must return as many names as there 524 | are metrics (see also `compile`). 525 | """ 526 | return [] 527 | 528 | 529 | # Note: the API of the `Env` and `Space` classes is taken from the OpenAI Gym implementation. 530 | # https://github.com/openai/gym/blob/master/gym/core.py 531 | 532 | 533 | class Env(object): 534 | """The abstract environment class that is used by all agents. This class has the exact 535 | same API that OpenAI Gym uses so that integrating with it is trivial. In contrast to the 536 | OpenAI Gym implementation, this class only defines the abstract methods without any actual 537 | implementation. 538 | """ 539 | reward_range = (-np.inf, np.inf) 540 | action_space = None 541 | observation_space = None 542 | 543 | def step(self, action): 544 | """Run one timestep of the environment's dynamics. 545 | Accepts an action and returns a tuple (observation, reward, done, info). 546 | 547 | # Arguments 548 | action (object): An action provided by the agent. 549 | 550 | # Returns 551 | observation (object): Agent's observation of the current environment. 552 | reward (float): Amount of reward returned after previous action. 553 | done (boolean): Whether the episode has ended, in which case further step() calls will return undefined results. 554 | info (dict): Contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). 555 | """ 556 | raise NotImplementedError() 557 | 558 | def reset(self): 559 | """ 560 | Resets the state of the environment and returns an initial observation. 561 | 562 | # Returns 563 | observation (object): The initial observation of the space. Initial reward is assumed to be 0. 564 | """ 565 | raise NotImplementedError() 566 | 567 | def render(self, mode='human', close=False): 568 | """Renders the environment. 569 | The set of supported modes varies per environment. (And some 570 | environments do not support rendering at all.) 571 | 572 | # Arguments 573 | mode (str): The mode to render with. 574 | close (bool): Close all open renderings. 575 | """ 576 | raise NotImplementedError() 577 | 578 | def close(self): 579 | """Override in your subclass to perform any necessary cleanup. 580 | Environments will automatically close() themselves when 581 | garbage collected or when the program exits. 582 | """ 583 | raise NotImplementedError() 584 | 585 | def seed(self, seed=None): 586 | """Sets the seed for this env's random number generator(s). 587 | 588 | # Returns 589 | Returns the list of seeds used in this env's random number generators 590 | """ 591 | raise NotImplementedError() 592 | 593 | def configure(self, *args, **kwargs): 594 | """Provides runtime configuration to the environment. 595 | This configuration should consist of data that tells your 596 | environment how to run (such as an address of a remote server, 597 | or path to your ImageNet data). It should not affect the 598 | semantics of the environment.
599 | """ 600 | raise NotImplementedError() 601 | 602 | def __del__(self): 603 | self.close() 604 | 605 | def __str__(self): 606 | return '<{} instance>'.format(type(self).__name__) 607 | 608 | 609 | class Space(object): 610 | """Abstract model for a space that is used for the state and action spaces. This class has the 611 | exact same API that OpenAI Gym uses so that integrating with it is trivial. 612 | """ 613 | 614 | def sample(self, seed=None): 615 | """Uniformly randomly sample a random element of this space. 616 | """ 617 | raise NotImplementedError() 618 | 619 | def contains(self, x): 620 | """Return boolean specifying if x is a valid member of this space 621 | """ 622 | raise NotImplementedError() 623 | --------------------------------------------------------------------------------
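The abstract `Env` and `Processor` interfaces in `rl/core.py` above are meant to be subclassed. Below is a minimal illustrative sketch (not a file from this repository) of how they are typically implemented; the class names `GuessNumberEnv` and `ClipRewardProcessor`, as well as the toy dynamics, are hypothetical, and only the interfaces defined in `rl/core.py` are assumed.

```python
import numpy as np

from rl.core import Env, Processor


class GuessNumberEnv(Env):
    """Hypothetical toy environment: guess a fixed integer in as few steps as possible."""

    def __init__(self, target=3):
        self.target = target
        self.last_guess = None

    def step(self, action):
        # Follows the (observation, reward, done, info) contract of `Env.step`.
        self.last_guess = int(action)
        done = self.last_guess == self.target
        reward = 1.0 if done else -0.1
        return np.array([self.last_guess], dtype=np.float32), reward, done, {}

    def reset(self):
        # Return the initial observation; -1 encodes "no guess made yet".
        self.last_guess = None
        return np.array([-1.0], dtype=np.float32)

    def render(self, mode='human', close=False):
        print('last guess: {}'.format(self.last_guess))

    def close(self):
        pass

    def seed(self, seed=None):
        np.random.seed(seed)
        return [seed]

    def configure(self, *args, **kwargs):
        pass


class ClipRewardProcessor(Processor):
    """Hypothetical processor that clips rewards to [-1, 1] before the agent sees them."""

    def process_reward(self, reward):
        return float(np.clip(reward, -1.0, 1.0))
```

A concrete agent from this library would then be constructed with `processor=ClipRewardProcessor()` and driven via `agent.fit(GuessNumberEnv(), nb_steps=...)` and `agent.test(...)`, which execute the training and testing loops defined in `rl/core.py`.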