├── rl ├── __init__.py ├── agents │ ├── __init__.py │ ├── cem.py │ ├── sarsa.py │ └── ddpg.py ├── keras_future.py ├── random.py ├── processors.py ├── policy.py ├── util.py ├── memory.py ├── callbacks.py └── core.py ├── tests ├── __init__.py ├── rl │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── test_cem.py │ │ ├── test_ddpg.py │ │ └── test_dqn.py │ ├── util.py │ ├── test_util.py │ ├── test_core.py │ └── test_memory.py └── integration │ ├── test_continuous.py │ └── test_discrete.py ├── docs ├── sources │ ├── index.md │ ├── agents │ │ ├── naf.md │ │ ├── sarsa.md │ │ ├── ddpg.md │ │ ├── cem.md │ │ ├── dqn.md │ │ └── overview.md │ ├── processors.md │ └── core.md ├── templates │ ├── index.md │ ├── core.md │ ├── processors.md │ └── agents │ │ ├── naf.md │ │ ├── ddpg.md │ │ ├── sarsa.md │ │ ├── cem.md │ │ ├── dqn.md │ │ └── overview.md ├── requirements.txt └── autogen.py ├── setup.cfg ├── assets ├── breakout.gif ├── cartpole.gif └── pendulum.gif ├── setup.py ├── mkdocs.yml ├── ISSUE_TEMPLATE.md ├── LICENSE ├── .gitignore ├── pytest.ini ├── examples ├── sarsa_cartpole.py ├── visualize_log.py ├── dqn_cartpole.py ├── cem_cartpole.py ├── duel_dqn_cartpole.py ├── ddpg_pendulum.py ├── ddpg_mujoco.py ├── naf_pendulum.py └── dqn_atari.py ├── .travis.yml └── README.md /rl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/sources/index.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/templates/index.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/rl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/rl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/templates/core.md: -------------------------------------------------------------------------------- 1 | {{autogenerated}} 2 | -------------------------------------------------------------------------------- /docs/templates/processors.md: -------------------------------------------------------------------------------- 1 | {{autogenerated}} 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | mkdocs 3 | python-markdown-math 4 | -------------------------------------------------------------------------------- /assets/breakout.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cafe/keras-rl/master/assets/breakout.gif 
-------------------------------------------------------------------------------- /assets/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cafe/keras-rl/master/assets/cartpole.gif -------------------------------------------------------------------------------- /assets/pendulum.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cafe/keras-rl/master/assets/pendulum.gif -------------------------------------------------------------------------------- /rl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .dqn import DQNAgent, NAFAgent, ContinuousDQNAgent 3 | from .ddpg import DDPGAgent 4 | from .cem import CEMAgent 5 | from .sarsa import SarsaAgent, SARSAAgent 6 | -------------------------------------------------------------------------------- /docs/templates/agents/naf.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Continuous Deep Q-Learning with Model-based Acceleration](https://arxiv.org/abs/1603.00748), Gu et al., 2016 11 | -------------------------------------------------------------------------------- /docs/templates/agents/ddpg.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971), Lillicrap et al., 2015 11 | -------------------------------------------------------------------------------- /docs/templates/agents/sarsa.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Reinforcement learning: An introduction](http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf), Sutton and Barto, 2011 11 | -------------------------------------------------------------------------------- /docs/templates/agents/cem.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Learning Tetris Using the Noisy Cross-Entropy Method](http://www.mitpressjournals.org/doi/abs/10.1162/neco.2006.18.12.2936?journalCode=neco), Szita et al., 2006 11 | - [Deep Reinforcement Learning (MLSS lecture notes)](http://learning.mpi-sws.org/mlss2016/slides/2016-MLSS-RL.pdf), Schulman, 2016 12 | -------------------------------------------------------------------------------- /docs/sources/agents/naf.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/dqn.py#L548) 6 | ### NAFAgent 7 | 8 | ```python 9 | rl.agents.dqn.NAFAgent(V_model, L_model, mu_model, random_process=None, covariance_mode='full') 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Continuous Deep Q-Learning with Model-based Acceleration](https://arxiv.org/abs/1603.00748), Gu et al., 2016 19 | -------------------------------------------------------------------------------- /docs/sources/agents/sarsa.md: 
-------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/sarsa.py#L17) 6 | ### SARSAAgent 7 | 8 | ```python 9 | rl.agents.sarsa.SARSAAgent(model, nb_actions, policy=None, test_policy=None, gamma=0.99, nb_steps_warmup=10, train_interval=1, delta_clip=inf) 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Reinforcement learning: An introduction](http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf), Sutton and Barto, 2011 19 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools import find_packages 3 | 4 | 5 | setup(name='keras-rl', 6 | version='0.3.1', 7 | description='Deep Reinforcement Learning for Keras', 8 | author='Matthias Plappert', 9 | author_email='matthiasplappert@me.com', 10 | url='https://github.com/matthiasplappert/keras-rl', 11 | download_url='https://github.com/matthiasplappert/keras-rl/archive/v0.3.1.tar.gz', 12 | license='MIT', 13 | install_requires=['keras>=1.0.7,<2.0.7'], 14 | extras_require={ 15 | 'gym': ['gym'], 16 | }, 17 | packages=find_packages()) 18 | -------------------------------------------------------------------------------- /docs/templates/agents/dqn.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/abs/1312.5602), Mnih et al., 2013 11 | - [Human-level control through deep reinforcement learning](http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html), Mnih et al., 2015 12 | - [Deep Reinforcement Learning with Double Q-learning](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Applications_files/doubledqn.pdf), van Hasselt et al., 2015 13 | - [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581), Wang et al., 2016 14 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Keras-RL Documentation 2 | theme: readthedocs 3 | docs_dir: docs/sources 4 | repo_url: https://github.com/matthiasplappert/keras-rl 5 | site_description: 'Documentation for Keras-RL, a library for Deep Reinforcement Learning with Keras.' 
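# Optional: uncomment the two lines below to enable math rendering; the mdx_math
# extension is provided by the python-markdown-math package listed in docs/requirements.txt.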
6 | #markdown_extensions: [mdx_math] 7 | #extra_javascript: ['https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML'] 8 | 9 | dev_addr: '0.0.0.0:8000' 10 | 11 | pages: 12 | - Home: index.md 13 | - Core: core.md 14 | - Agents: 15 | - Overview: agents/overview.md 16 | - DQNAgent: agents/dqn.md 17 | - NAFAgent: agents/naf.md 18 | - DDPGAgent: agents/ddpg.md 19 | - SARSAAgent: agents/sarsa.md 20 | - CEMAgent: agents/cem.md 21 | -------------------------------------------------------------------------------- /docs/sources/agents/ddpg.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/ddpg.py#L22) 6 | ### DDPGAgent 7 | 8 | ```python 9 | rl.agents.ddpg.DDPGAgent(nb_actions, actor, critic, critic_action_input, memory, gamma=0.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, train_interval=1, memory_interval=1, delta_range=None, delta_clip=inf, random_process=None, custom_model_objects={}, target_model_update=0.001) 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971), Lillicrap et al., 2015 19 | -------------------------------------------------------------------------------- /tests/rl/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | from rl.core import Env 5 | 6 | 7 | class MultiInputTestEnv(Env): 8 | def __init__(self, observation_shape): 9 | self.observation_shape = observation_shape 10 | 11 | def step(self, action): 12 | return self._get_obs(), random.choice([0, 1]), random.choice([True, False]), {} 13 | 14 | def reset(self): 15 | return self._get_obs() 16 | 17 | def _get_obs(self): 18 | if type(self.observation_shape) is list: 19 | return [np.random.random(s) for s in self.observation_shape] 20 | else: 21 | return np.random.random(self.observation_shape) 22 | 23 | def __del__(self): 24 | pass 25 | -------------------------------------------------------------------------------- /rl/keras_future.py: -------------------------------------------------------------------------------- 1 | import keras 2 | import keras.layers 3 | import keras.models 4 | 5 | 6 | def concatenate(x): 7 | if hasattr(keras.layers, 'Concatenate'): 8 | return keras.layers.Concatenate()(x) 9 | else: 10 | return keras.layers.merge(x, mode='concat') 11 | 12 | 13 | def add(x): 14 | if hasattr(keras.layers, 'Add'): 15 | return keras.layers.Add()(x) 16 | else: 17 | return keras.layers.merge(x, mode='sum') 18 | 19 | 20 | def Model(input, output, **kwargs): 21 | if int(keras.__version__.split('.')[0]) >= 2: 22 | return keras.models.Model(inputs=input, outputs=output, **kwargs) 23 | else: 24 | return keras.models.Model(input=input, output=output, **kwargs) 25 | -------------------------------------------------------------------------------- /docs/sources/agents/cem.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/cem.py#L12) 6 | ### CEMAgent 7 | 8 | ```python 9 | rl.agents.cem.CEMAgent(model, nb_actions, memory, batch_size=50, nb_steps_warmup=1000, train_interval=50, elite_frac=0.05, memory_interval=1, theta_init=None, noise_decay_const=0.0, noise_ampl=0.0) 10 | ``` 11 | 12 | Write me 13 | 
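A minimal usage sketch, adapted from `examples/cem_cartpole.py` in this repository (the environment and hyperparameters are illustrative, not required values):

```python
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

env = gym.make('CartPole-v0')
nb_actions = env.action_space.n

# A simple linear policy with a softmax over the available actions.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

# CEM keeps a memory of episode parameters and returns rather than individual transitions.
memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
cem.test(env, nb_episodes=5, visualize=True)
```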
14 | 15 | --- 16 | 17 | ### References 18 | - [Learning Tetris Using the Noisy Cross-Entropy Method](http://www.mitpressjournals.org/doi/abs/10.1162/neco.2006.18.12.2936?journalCode=neco), Szita et al., 2006 19 | - [Deep Reinforcement Learning (MLSS lecture notes)](http://learning.mpi-sws.org/mlss2016/slides/2016-MLSS-RL.pdf), Schulman, 2016 20 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Please make sure that the boxes below are checked before you submit your issue. If your issue is an implementation question, please ask your question in the [Keras-RL Google group](https://groups.google.com/forum/#!forum/keras-rl-users) or [join the Keras-RL Gitter channel](https://gitter.im/keras-rl/Lobby) and ask there instead of filing a GitHub issue. 2 | 3 | Thank you! 4 | 5 | - [ ] Check that you are up-to-date with the master branch of Keras-RL. You can update with: 6 | `pip install git+git://github.com/matthiasplappert/keras-rl.git --upgrade --no-deps` 7 | 8 | - [ ] Check that you are up-to-date with the master branch of Keras. You can update with: 9 | `pip install git+git://github.com/fchollet/keras.git --upgrade --no-deps` 10 | 11 | - [ ] Provide a link to a GitHub Gist of a Python script that can reproduce your issue (or just copy the script here if it is short). If you report an error, please include the error message and the backtrace. 12 | -------------------------------------------------------------------------------- /docs/sources/agents/dqn.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/dqn.py#L89) 6 | ### DQNAgent 7 | 8 | ```python 9 | rl.agents.dqn.DQNAgent(model, policy=None, test_policy=None, enable_double_dqn=True, enable_dueling_network=False, dueling_type='avg') 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/abs/1312.5602), Mnih et al., 2013 19 | - [Human-level control through deep reinforcement learning](http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html), Mnih et al., 2015 20 | - [Deep Reinforcement Learning with Double Q-learning](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Applications_files/doubledqn.pdf), van Hasselt et al., 2015 21 | - [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581), Wang et al., 2016 22 | -------------------------------------------------------------------------------- /docs/templates/agents/overview.md: -------------------------------------------------------------------------------- 1 | ## Available Agents 2 | 3 | | Name | Implementation | Observation Space | Action Space | 4 | | ---------------------- |------------------------| -------------------| ---------------| 5 | | [DQN](/agents/dqn) | `rl.agents.DQNAgent` | discrete or continuous | discrete | 6 | | [DDPG](/agents/ddpg) | `rl.agents.DDPGAgent` | discrete or continuous | continuous | 7 | | [NAF](/agents/naf) | `rl.agents.NAFAgent` | discrete or continuous | continuous | 8 | | [CEM](/agents/cem) | `rl.agents.CEMAgent` | discrete or continuous | discrete | 9 | | [SARSA](/agents/sarsa) | `rl.agents.SARSAAgent` | discrete or continuous | discrete | 10 | 11 | --- 12 | 13 | ## Common API 14 | 15 | All agents share a common API. 
This allows you to easily switch between different agents. 16 | That being said, keep in mind that some agents make assumptions regarding the action space, i.e. assume discrete 17 | or continuous actions. 18 | 19 | {{autogenerated}} 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Matthias Plappert 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS X 2 | .DS_Store 3 | docs/site/* 4 | 5 | # Ubuntu 6 | *~ 7 | 8 | # PyCharm 9 | .idea 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | env/ 21 | build/ 22 | download/ 23 | bin/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | include/ 33 | lib/ 34 | man/ 35 | local/ 36 | var/ 37 | share/ 38 | pip-selfcheck.json 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *,cover 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # Configuration of py.test 2 | [pytest] 3 | addopts=-v 4 | -n 2 5 | --durations=10 6 | --cov-report term-missing 7 | --cov=rl 8 | 9 | # Do not run tests in the build folder or in the virtualenv folder `venv`. 
10 | norecursedirs=build venv 11 | 12 | # PEP-8 The following are ignored: 13 | # E251 unexpected spaces around keyword / parameter equals 14 | # E225 missing whitespace around operator 15 | # E226 missing whitespace around arithmetic operator 16 | # W291 trailing whitespace 17 | # W293 blank line contains whitespace 18 | # E501 line too long (82 > 79 characters) 19 | # E402 module level import not at top of file - temporary measure to coninue adding ros python packaged in sys.path 20 | # E731 do not assign a lambda expression, use a def 21 | # E302 two blank lines between the functions 22 | # E231 missing whitespace after , 23 | # E241 multiple spaces after ',' 24 | # E261 at least two spaces before inline comment 25 | 26 | 27 | pep8ignore=* E251 \ 28 | * E225 \ 29 | * E226 \ 30 | * W291 \ 31 | * W293 \ 32 | * E501 \ 33 | * E402 \ 34 | * E731 \ 35 | * E302 \ 36 | * E231 \ 37 | * E241 \ 38 | * E261 39 | -------------------------------------------------------------------------------- /tests/rl/agents/test_cem.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import absolute_import 3 | 4 | import pytest 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | 8 | from keras.models import Model, Sequential 9 | from keras.layers import Input, merge, Dense, Flatten 10 | 11 | from rl.agents.cem import CEMAgent 12 | from rl.memory import EpisodeParameterMemory 13 | from rl.processors import MultiInputProcessor 14 | 15 | from ..util import MultiInputTestEnv 16 | 17 | 18 | def test_single_cem_input(): 19 | model = Sequential() 20 | model.add(Flatten(input_shape=(2, 3))) 21 | model.add(Dense(2)) 22 | 23 | memory = EpisodeParameterMemory(limit=10, window_length=2) 24 | agent = CEMAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, train_interval=50) 25 | agent.compile() 26 | agent.fit(MultiInputTestEnv((3,)), nb_steps=100) 27 | 28 | 29 | def test_multi_cem_input(): 30 | input1 = Input(shape=(2, 3)) 31 | input2 = Input(shape=(2, 4)) 32 | x = merge([input1, input2], mode='concat') 33 | x = Flatten()(x) 34 | x = Dense(2)(x) 35 | model = Model(input=[input1, input2], output=x) 36 | 37 | memory = EpisodeParameterMemory(limit=10, window_length=2) 38 | processor = MultiInputProcessor(nb_inputs=2) 39 | agent = CEMAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, 40 | processor=processor, train_interval=50) 41 | agent.compile() 42 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=100) 43 | -------------------------------------------------------------------------------- /examples/sarsa_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents import SARSAAgent 9 | from rl.policy import BoltzmannQPolicy 10 | 11 | 12 | ENV_NAME = 'CartPole-v0' 13 | 14 | # Get the environment and extract the number of actions. 15 | env = gym.make(ENV_NAME) 16 | np.random.seed(123) 17 | env.seed(123) 18 | nb_actions = env.action_space.n 19 | 20 | # Next, we build a very simple model. 
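# Note on the input shape used throughout these examples: observations are passed to the
# model with a leading window dimension, i.e. (window_length,) + observation_shape. Here
# the window holds a single observation, so the shape is (1,) + shape and Flatten collapses it.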
21 | model = Sequential() 22 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 23 | model.add(Dense(16)) 24 | model.add(Activation('relu')) 25 | model.add(Dense(16)) 26 | model.add(Activation('relu')) 27 | model.add(Dense(16)) 28 | model.add(Activation('relu')) 29 | model.add(Dense(nb_actions)) 30 | model.add(Activation('linear')) 31 | print(model.summary()) 32 | 33 | # SARSA does not require a memory. 34 | policy = BoltzmannQPolicy() 35 | sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy) 36 | sarsa.compile(Adam(lr=1e-3), metrics=['mae']) 37 | 38 | # Okay, now it's time to learn something! We visualize the training here for show, but this 39 | # slows down training quite a lot. You can always safely abort the training prematurely using 40 | # Ctrl + C. 41 | sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2) 42 | 43 | # After training is done, we save the final weights. 44 | sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 45 | 46 | # Finally, evaluate our algorithm for 5 episodes. 47 | sarsa.test(env, nb_episodes=5, visualize=True) 48 | -------------------------------------------------------------------------------- /examples/visualize_log.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def visualize_log(filename, figsize=None, output=None): 8 | with open(filename, 'r') as f: 9 | data = json.load(f) 10 | if 'episode' not in data: 11 | raise ValueError('Log file "{}" does not contain the "episode" key.'.format(filename)) 12 | episodes = data['episode'] 13 | 14 | # Get value keys. The x axis is shared and is the number of episodes. 15 | keys = sorted(list(set(data.keys()).difference(set(['episode'])))) 16 | 17 | if figsize is None: 18 | figsize = (15., 5. * len(keys)) 19 | f, axarr = plt.subplots(len(keys), sharex=True, figsize=figsize) 20 | for idx, key in enumerate(keys): 21 | axarr[idx].plot(episodes, data[key]) 22 | axarr[idx].set_ylabel(key) 23 | plt.xlabel('episodes') 24 | plt.tight_layout() 25 | if output is None: 26 | plt.show() 27 | else: 28 | plt.savefig(output) 29 | 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('filename', type=str, help='The filename of the JSON log generated during training.') 33 | parser.add_argument('--output', type=str, default=None, help='The output file. If not specified, the log will only be displayed.') 34 | parser.add_argument('--figsize', nargs=2, type=float, default=None, help='The size of the figure in `width height` format specified in points.') 35 | args = parser.parse_args() 36 | 37 | # You can use visualize_log to easily view the stats that were recorded during training. Simply 38 | # provide the filename of the `FileLogger` that was used in `FileLogger`. 
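# For reference, such a JSON log is produced during training by attaching the FileLogger
# callback from rl.callbacks, e.g. (hypothetical filename):
#   dqn.fit(env, nb_steps=50000, callbacks=[FileLogger('dqn_log.json', interval=100)])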
39 | visualize_log(args.filename, output=args.output, figsize=args.figsize) 40 | -------------------------------------------------------------------------------- /examples/dqn_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents.dqn import DQNAgent 9 | from rl.policy import BoltzmannQPolicy 10 | from rl.memory import SequentialMemory 11 | 12 | 13 | ENV_NAME = 'CartPole-v0' 14 | 15 | 16 | # Get the environment and extract the number of actions. 17 | env = gym.make(ENV_NAME) 18 | np.random.seed(123) 19 | env.seed(123) 20 | nb_actions = env.action_space.n 21 | 22 | # Next, we build a very simple model. 23 | model = Sequential() 24 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 25 | model.add(Dense(16)) 26 | model.add(Activation('relu')) 27 | model.add(Dense(16)) 28 | model.add(Activation('relu')) 29 | model.add(Dense(16)) 30 | model.add(Activation('relu')) 31 | model.add(Dense(nb_actions)) 32 | model.add(Activation('linear')) 33 | print(model.summary()) 34 | 35 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 36 | # even the metrics! 37 | memory = SequentialMemory(limit=50000, window_length=1) 38 | policy = BoltzmannQPolicy() 39 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 40 | target_model_update=1e-2, policy=policy) 41 | dqn.compile(Adam(lr=1e-3), metrics=['mae']) 42 | 43 | # Okay, now it's time to learn something! We visualize the training here for show, but this 44 | # slows down training quite a lot. You can always safely abort the training prematurely using 45 | # Ctrl + C. 46 | dqn.fit(env, nb_steps=50000, visualize=True, verbose=2) 47 | 48 | # After training is done, we save the final weights. 49 | dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 50 | 51 | # Finally, evaluate our algorithm for 5 episodes. 52 | dqn.test(env, nb_episodes=5, visualize=True) 53 | -------------------------------------------------------------------------------- /docs/sources/processors.md: -------------------------------------------------------------------------------- 1 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/processors.py#L7) 2 | ### MultiInputProcessor 3 | 4 | ```python 5 | rl.processors.MultiInputProcessor(nb_inputs) 6 | ``` 7 | 8 | Converts observations from an environment with multiple observations for use in a neural network 9 | policy. 10 | 11 | In some cases, you have environments that return multiple different observations per timestep 12 | (in a robotics context, for example, a camera may be used to view the scene and a joint encoder may 13 | be used to report the angles for each joint). Usually, this can be handled by a policy that has 14 | multiple inputs, one for each modality. However, observations are returned by the environment 15 | in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network 16 | expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [[modalityn_1, ..., modalityn_T]]`. 17 | This processor converts observations appropriate for this use case. 18 | 19 | __Arguments__ 20 | 21 | - __nb_inputs__ (integer): The number of inputs, that is different modalities, to be used. 
22 | Your neural network that you use for the policy must have a corresponding number of 23 | inputs. 24 | 25 | ---- 26 | 27 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/processors.py#L40) 28 | ### WhiteningNormalizerProcessor 29 | 30 | ```python 31 | rl.processors.WhiteningNormalizerProcessor() 32 | ``` 33 | 34 | Normalizes the observations to have zero mean and standard deviation of one, 35 | i.e. it applies whitening to the inputs. 36 | 37 | This typically helps significantly with learning, especially if different dimensions are 38 | on different scales. However, it complicates training in the sense that you will have to store 39 | these weights alongside the policy if you intend to load it later. It is the responsibility of 40 | the user to do so. 41 | 42 | -------------------------------------------------------------------------------- /examples/cem_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents.cem import CEMAgent 9 | from rl.memory import EpisodeParameterMemory 10 | 11 | ENV_NAME = 'CartPole-v0' 12 | 13 | 14 | # Get the environment and extract the number of actions. 15 | env = gym.make(ENV_NAME) 16 | np.random.seed(123) 17 | env.seed(123) 18 | 19 | nb_actions = env.action_space.n 20 | obs_dim = env.observation_space.shape[0] 21 | 22 | # Option 1 : Simple model 23 | model = Sequential() 24 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 25 | model.add(Dense(nb_actions)) 26 | model.add(Activation('softmax')) 27 | 28 | # Option 2: deep network 29 | # model = Sequential() 30 | # model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 31 | # model.add(Dense(16)) 32 | # model.add(Activation('relu')) 33 | # model.add(Dense(16)) 34 | # model.add(Activation('relu')) 35 | # model.add(Dense(16)) 36 | # model.add(Activation('relu')) 37 | # model.add(Dense(nb_actions)) 38 | # model.add(Activation('softmax')) 39 | 40 | 41 | print(model.summary()) 42 | 43 | 44 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 45 | # even the metrics! 46 | memory = EpisodeParameterMemory(limit=1000, window_length=1) 47 | 48 | cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, 49 | batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05) 50 | cem.compile() 51 | 52 | # Okay, now it's time to learn something! We visualize the training here for show, but this 53 | # slows down training quite a lot. You can always safely abort the training prematurely using 54 | # Ctrl + C. 55 | cem.fit(env, nb_steps=100000, visualize=False, verbose=2) 56 | 57 | # After training is done, we save the best weights. 58 | cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True) 59 | 60 | # Finally, evaluate our algorithm for 5 episodes. 
61 | cem.test(env, nb_episodes=5, visualize=True) 62 | -------------------------------------------------------------------------------- /examples/duel_dqn_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents.dqn import DQNAgent 9 | from rl.policy import BoltzmannQPolicy 10 | from rl.memory import SequentialMemory 11 | 12 | 13 | ENV_NAME = 'CartPole-v0' 14 | 15 | 16 | # Get the environment and extract the number of actions. 17 | env = gym.make(ENV_NAME) 18 | np.random.seed(123) 19 | env.seed(123) 20 | nb_actions = env.action_space.n 21 | 22 | # Next, we build a very simple model regardless of the dueling architecture 23 | # if you enable dueling network in DQN , DQN will build a dueling network base on your model automatically 24 | # Also, you can build a dueling network by yourself and turn off the dueling network in DQN. 25 | model = Sequential() 26 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 27 | model.add(Dense(16)) 28 | model.add(Activation('relu')) 29 | model.add(Dense(16)) 30 | model.add(Activation('relu')) 31 | model.add(Dense(16)) 32 | model.add(Activation('relu')) 33 | model.add(Dense(nb_actions, activation='linear')) 34 | print(model.summary()) 35 | 36 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 37 | # even the metrics! 38 | memory = SequentialMemory(limit=50000, window_length=1) 39 | policy = BoltzmannQPolicy() 40 | # enable the dueling network 41 | # you can specify the dueling_type to one of {'avg','max','naive'} 42 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 43 | enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=policy) 44 | dqn.compile(Adam(lr=1e-3), metrics=['mae']) 45 | 46 | # Okay, now it's time to learn something! We visualize the training here for show, but this 47 | # slows down training quite a lot. You can always safely abort the training prematurely using 48 | # Ctrl + C. 49 | dqn.fit(env, nb_steps=50000, visualize=False, verbose=2) 50 | 51 | # After training is done, we save the final weights. 52 | dqn.save_weights('duel_dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 53 | 54 | # Finally, evaluate our algorithm for 5 episodes. 55 | dqn.test(env, nb_episodes=5, visualize=False) 56 | -------------------------------------------------------------------------------- /rl/random.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | 4 | 5 | class RandomProcess(object): 6 | def reset_states(self): 7 | pass 8 | 9 | 10 | class AnnealedGaussianProcess(RandomProcess): 11 | def __init__(self, mu, sigma, sigma_min, n_steps_annealing): 12 | self.mu = mu 13 | self.sigma = sigma 14 | self.n_steps = 0 15 | 16 | if sigma_min is not None: 17 | self.m = -float(sigma - sigma_min) / float(n_steps_annealing) 18 | self.c = sigma 19 | self.sigma_min = sigma_min 20 | else: 21 | self.m = 0. 
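            # (With sigma_min unset, annealing is disabled: the zero slope above keeps
            # current_sigma fixed at sigma.)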
22 | self.c = sigma 23 | self.sigma_min = sigma 24 | 25 | @property 26 | def current_sigma(self): 27 | sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c) 28 | return sigma 29 | 30 | 31 | class GaussianWhiteNoiseProcess(AnnealedGaussianProcess): 32 | def __init__(self, mu=0., sigma=1., sigma_min=None, n_steps_annealing=1000, size=1): 33 | super(GaussianWhiteNoiseProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing) 34 | self.size = size 35 | 36 | def sample(self): 37 | sample = np.random.normal(self.mu, self.current_sigma, self.size) 38 | self.n_steps += 1 39 | return sample 40 | 41 | # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 42 | class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess): 43 | def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000): 44 | super(OrnsteinUhlenbeckProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing) 45 | self.theta = theta 46 | self.mu = mu 47 | self.dt = dt 48 | self.x0 = x0 49 | self.size = size 50 | self.reset_states() 51 | 52 | def sample(self): 53 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size) 54 | self.x_prev = x 55 | self.n_steps += 1 56 | return x 57 | 58 | def reset_states(self): 59 | self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size) 60 | -------------------------------------------------------------------------------- /docs/sources/core.md: -------------------------------------------------------------------------------- 1 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L11) 2 | ### Agent 3 | 4 | ```python 5 | rl.core.Agent(processor=None) 6 | ``` 7 | 8 | Abstract base class for all implemented agents. 9 | 10 | Each agent interacts with the environment (as defined by the `Env` class) by first observing the 11 | state of the environment. Based on this observation the agent changes the environment by performing 12 | an action. 13 | 14 | Do not use this abstract base class directly but instead use one of the concrete agents implemented. 15 | Each agent realizes a reinforcement learning algorithm. Since all agents conform to the same 16 | interface, you can use them interchangeably. 17 | 18 | To implement your own agent, you have to implement the following methods: 19 | 20 | - `forward` 21 | - `backward` 22 | - `compile` 23 | - `load_weights` 24 | - `save_weights` 25 | - `layers` 26 | 27 | __Arguments__ 28 | 29 | - __processor__ (`Processor` instance): See [Processor](#processor) for details. 30 | 31 | ---- 32 | 33 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L454) 34 | ### Processor 35 | 36 | ```python 37 | rl.core.Processor() 38 | ``` 39 | 40 | Abstract base class for implementing processors. 41 | 42 | A processor acts as a coupling mechanism between an `Agent` and its `Env`. This can 43 | be necessary if your agent has different requirements with respect to the form of the 44 | observations, actions, and rewards of the environment. By implementing a custom processor, 45 | you can effectively translate between the two without having to change the underlaying 46 | implementation of the agent or environment. 47 | 48 | Do not use this abstract base class directly but instead use one of the concrete implementations 49 | or write your own. 
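A minimal sketch of a custom processor, modelled on the `PendulumProcessor` and `MujocoProcessor` classes used in the bundled examples (the scaling factor and clipping range below are purely illustrative):

```python
import numpy as np

from rl.core import Processor


class ScaleAndClipProcessor(Processor):
    """Rescales rewards and clips actions before they are exchanged with the environment."""

    def process_reward(self, reward):
        # Reduce the reward magnitude by two orders of magnitude.
        return reward / 100.

    def process_action(self, action):
        # Keep actions within the range the environment accepts.
        return np.clip(action, -1., 1.)
```

An instance is passed to an agent via its `processor` argument, for example `DDPGAgent(..., processor=ScaleAndClipProcessor())`.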
50 | 51 | ---- 52 | 53 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L533) 54 | ### Env 55 | 56 | ```python 57 | rl.core.Env() 58 | ``` 59 | 60 | The abstract environment class that is used by all agents. This class has the exact 61 | same API that OpenAI Gym uses so that integrating with it is trivial. In contrast to the 62 | OpenAI Gym implementation, this class only defines the abstract methods without any actual 63 | implementation. 64 | 65 | ---- 66 | 67 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L609) 68 | ### Space 69 | 70 | ```python 71 | rl.core.Space() 72 | ``` 73 | 74 | Abstract model for a space that is used for the state and action spaces. This class has the 75 | exact same API that OpenAI Gym uses so that integrating with it is trivial. 76 | 77 | -------------------------------------------------------------------------------- /rl/processors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rl.core import Processor 4 | from rl.util import WhiteningNormalizer 5 | 6 | 7 | class MultiInputProcessor(Processor): 8 | """Converts observations from an environment with multiple observations for use in a neural network 9 | policy. 10 | 11 | In some cases, you have environments that return multiple different observations per timestep 12 | (in a robotics context, for example, a camera may be used to view the scene and a joint encoder may 13 | be used to report the angles for each joint). Usually, this can be handled by a policy that has 14 | multiple inputs, one for each modality. However, observations are returned by the environment 15 | in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network 16 | expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [[modalityn_1, ..., modalityn_T]]`. 17 | This processor converts observations appropriate for this use case. 18 | 19 | # Arguments 20 | nb_inputs (integer): The number of inputs, that is different modalities, to be used. 21 | Your neural network that you use for the policy must have a corresponding number of 22 | inputs. 23 | """ 24 | def __init__(self, nb_inputs): 25 | self.nb_inputs = nb_inputs 26 | 27 | def process_state_batch(self, state_batch): 28 | input_batches = [[] for x in range(self.nb_inputs)] 29 | for state in state_batch: 30 | processed_state = [[] for x in range(self.nb_inputs)] 31 | for observation in state: 32 | assert len(observation) == self.nb_inputs 33 | for o, s in zip(observation, processed_state): 34 | s.append(o) 35 | for idx, s in enumerate(processed_state): 36 | input_batches[idx].append(s) 37 | return [np.array(x) for x in input_batches] 38 | 39 | 40 | class WhiteningNormalizerProcessor(Processor): 41 | """Normalizes the observations to have zero mean and standard deviation of one, 42 | i.e. it applies whitening to the inputs. 43 | 44 | This typically helps significantly with learning, especially if different dimensions are 45 | on different scales. However, it complicates training in the sense that you will have to store 46 | these weights alongside the policy if you intend to load it later. It is the responsibility of 47 | the user to do so. 
48 | """ 49 | def __init__(self): 50 | self.normalizer = None 51 | 52 | def process_state_batch(self, batch): 53 | if self.normalizer is None: 54 | self.normalizer = WhiteningNormalizer(shape=batch.shape[1:], dtype=batch.dtype) 55 | self.normalizer.update(batch) 56 | return self.normalizer.normalize(batch) 57 | -------------------------------------------------------------------------------- /examples/ddpg_pendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential, Model 5 | from keras.layers import Dense, Activation, Flatten, Input, merge 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents import DDPGAgent 9 | from rl.memory import SequentialMemory 10 | from rl.random import OrnsteinUhlenbeckProcess 11 | 12 | 13 | ENV_NAME = 'Pendulum-v0' 14 | gym.undo_logger_setup() 15 | 16 | 17 | # Get the environment and extract the number of actions. 18 | env = gym.make(ENV_NAME) 19 | np.random.seed(123) 20 | env.seed(123) 21 | assert len(env.action_space.shape) == 1 22 | nb_actions = env.action_space.shape[0] 23 | 24 | # Next, we build a very simple model. 25 | actor = Sequential() 26 | actor.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 27 | actor.add(Dense(16)) 28 | actor.add(Activation('relu')) 29 | actor.add(Dense(16)) 30 | actor.add(Activation('relu')) 31 | actor.add(Dense(16)) 32 | actor.add(Activation('relu')) 33 | actor.add(Dense(nb_actions)) 34 | actor.add(Activation('linear')) 35 | print(actor.summary()) 36 | 37 | action_input = Input(shape=(nb_actions,), name='action_input') 38 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 39 | flattened_observation = Flatten()(observation_input) 40 | x = merge([action_input, flattened_observation], mode='concat') 41 | x = Dense(32)(x) 42 | x = Activation('relu')(x) 43 | x = Dense(32)(x) 44 | x = Activation('relu')(x) 45 | x = Dense(32)(x) 46 | x = Activation('relu')(x) 47 | x = Dense(1)(x) 48 | x = Activation('linear')(x) 49 | critic = Model(input=[action_input, observation_input], output=x) 50 | print(critic.summary()) 51 | 52 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 53 | # even the metrics! 54 | memory = SequentialMemory(limit=100000, window_length=1) 55 | random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3) 56 | agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, 57 | memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, 58 | random_process=random_process, gamma=.99, target_model_update=1e-3) 59 | agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) 60 | 61 | # Okay, now it's time to learn something! We visualize the training here for show, but this 62 | # slows down training quite a lot. You can always safely abort the training prematurely using 63 | # Ctrl + C. 64 | agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200) 65 | 66 | # After training is done, we save the final weights. 67 | agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 68 | 69 | # Finally, evaluate our algorithm for 5 episodes. 
70 | agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200) 71 | -------------------------------------------------------------------------------- /tests/rl/agents/test_ddpg.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import absolute_import 3 | 4 | import pytest 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | 8 | from keras.models import Model, Sequential 9 | from keras.layers import Input, merge, Dense, Flatten 10 | 11 | from rl.agents.ddpg import DDPGAgent 12 | from rl.memory import SequentialMemory 13 | from rl.processors import MultiInputProcessor 14 | 15 | from ..util import MultiInputTestEnv 16 | 17 | 18 | def test_single_ddpg_input(): 19 | nb_actions = 2 20 | 21 | actor = Sequential() 22 | actor.add(Flatten(input_shape=(2, 3))) 23 | actor.add(Dense(nb_actions)) 24 | 25 | action_input = Input(shape=(nb_actions,), name='action_input') 26 | observation_input = Input(shape=(2, 3), name='observation_input') 27 | x = merge([action_input, Flatten()(observation_input)], mode='concat') 28 | x = Dense(1)(x) 29 | critic = Model(input=[action_input, observation_input], output=x) 30 | 31 | memory = SequentialMemory(limit=10, window_length=2) 32 | agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input, memory=memory, 33 | nb_actions=2, nb_steps_warmup_critic=5, nb_steps_warmup_actor=5, batch_size=4) 34 | agent.compile('sgd') 35 | agent.fit(MultiInputTestEnv((3,)), nb_steps=10) 36 | 37 | 38 | def test_multi_ddpg_input(): 39 | nb_actions = 2 40 | 41 | actor_observation_input1 = Input(shape=(2, 3), name='actor_observation_input1') 42 | actor_observation_input2 = Input(shape=(2, 4), name='actor_observation_input2') 43 | actor = Sequential() 44 | x = merge([actor_observation_input1, actor_observation_input2], mode='concat') 45 | x = Flatten()(x) 46 | x = Dense(nb_actions)(x) 47 | actor = Model(input=[actor_observation_input1, actor_observation_input2], output=x) 48 | 49 | action_input = Input(shape=(nb_actions,), name='action_input') 50 | critic_observation_input1 = Input(shape=(2, 3), name='critic_observation_input1') 51 | critic_observation_input2 = Input(shape=(2, 4), name='critic_observation_input2') 52 | x = merge([critic_observation_input1, critic_observation_input2], mode='concat') 53 | x = merge([action_input, Flatten()(x)], mode='concat') 54 | x = Dense(1)(x) 55 | critic = Model(input=[action_input, critic_observation_input1, critic_observation_input2], output=x) 56 | 57 | processor = MultiInputProcessor(nb_inputs=2) 58 | memory = SequentialMemory(limit=10, window_length=2) 59 | agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input, memory=memory, 60 | nb_actions=2, nb_steps_warmup_critic=5, nb_steps_warmup_actor=5, batch_size=4, 61 | processor=processor) 62 | agent.compile('sgd') 63 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10) 64 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | language: python 4 | matrix: 5 | include: 6 | - python: 3.5 7 | env: KERAS_BACKEND=theano 8 | - python: 3.5 9 | env: KERAS_BACKEND=tensorflow 10 | - python: 2.7 11 | env: KERAS_BACKEND=theano 12 | - python: 2.7 13 | env: KERAS_BACKEND=tensorflow 14 | - python: 2.7 15 | env: KERAS_BACKEND=tensorflow LEGACY_KERAS=1 16 | - python: 2.7 17 | env: 
KERAS_BACKEND=tensorflow TEST_MODE=PEP8 18 | - python: 2.7 19 | env: KERAS_BACKEND=theano TEST_MODE=INTEGRATION 20 | - python: 3.5 21 | env: KERAS_BACKEND=theano TEST_MODE=INTEGRATION 22 | - python: 2.7 23 | env: KERAS_BACKEND=tensorflow TEST_MODE=INTEGRATION 24 | - python: 3.5 25 | env: KERAS_BACKEND=tensorflow TEST_MODE=INTEGRATION 26 | install: 27 | # Adopted from https://github.com/fchollet/keras/blob/master/.travis.yml. 28 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 29 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; 30 | else 31 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 32 | fi 33 | - bash miniconda.sh -b -p $HOME/miniconda 34 | - export PATH="$HOME/miniconda/bin:$PATH" 35 | - hash -r 36 | - conda config --set always_yes yes --set changeps1 no 37 | - conda update -q conda 38 | # Useful for debugging any issues with conda 39 | - conda info -a 40 | 41 | - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib pandas pytest h5py 42 | - source activate test-environment 43 | - pip install pytest-xdist 44 | # See https://github.com/pytest-dev/pytest-cov/issues/124 for details 45 | - pip install pytest-cov==2.2.1 python-coveralls coverage==3.7.1 46 | - pip install pep8 pytest-pep8 47 | - pip install tensorflow 48 | # Bleeding-edge: pip install git+https://github.com/Theano/Theano.git 49 | - pip install theano>=0.9.0rc1 50 | - pip install gym 51 | # Bleeding-edge: pip install git+https://github.com/fchollet/keras.git; 52 | - if [[ "$LEGACY_KERAS" == "1" ]]; then 53 | pip install keras==1.2.2; 54 | else 55 | pip install "keras<2.0.7"; 56 | fi 57 | 58 | - python setup.py install 59 | 60 | # command to run tests. 61 | script: 62 | # Run keras backend init to initialize backend config. 63 | - python -c "import keras.backend" 64 | # Set up keras backend 65 | - sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' ~/.keras/keras.json; 66 | - echo -e "Running tests with the following config:\n$(cat ~/.keras/keras.json)" 67 | - if [[ "$TEST_MODE" == "INTEGRATION" ]]; then 68 | PYTHONPATH=$PWD:$PYTHONPATH py.test tests/integration; 69 | elif [[ "$TEST_MODE" == "PEP8" ]]; then 70 | PYTHONPATH=$PWD:$PYTHONPATH py.test --pep8 -m pep8 -n0; 71 | else 72 | PYTHONPATH=$PWD:$PYTHONPATH py.test tests/; 73 | fi 74 | after_success: 75 | - coveralls 76 | -------------------------------------------------------------------------------- /examples/ddpg_mujoco.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import gym 4 | from gym import wrappers 5 | 6 | from keras.models import Sequential, Model 7 | from keras.layers import Dense, Activation, Flatten, Input 8 | from keras.optimizers import Adam 9 | 10 | from rl.processors import WhiteningNormalizerProcessor 11 | from rl.agents import DDPGAgent 12 | from rl.memory import SequentialMemory 13 | from rl.random import OrnsteinUhlenbeckProcess 14 | from rl.keras_future import concatenate 15 | 16 | 17 | class MujocoProcessor(WhiteningNormalizerProcessor): 18 | def process_action(self, action): 19 | return np.clip(action, -1., 1.) 20 | 21 | 22 | ENV_NAME = 'HalfCheetah-v1' 23 | gym.undo_logger_setup() 24 | 25 | 26 | # Get the environment and extract the number of actions. 
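# The Monitor wrapper applied below records episode statistics (and videos, where supported)
# under /tmp/<ENV_NAME>; force=True overwrites results from previous runs.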
27 | env = gym.make(ENV_NAME) 28 | env = wrappers.Monitor(env, '/tmp/{}'.format(ENV_NAME), force=True) 29 | np.random.seed(123) 30 | env.seed(123) 31 | assert len(env.action_space.shape) == 1 32 | nb_actions = env.action_space.shape[0] 33 | 34 | # Next, we build a very simple model. 35 | actor = Sequential() 36 | actor.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 37 | actor.add(Dense(400)) 38 | actor.add(Activation('relu')) 39 | actor.add(Dense(300)) 40 | actor.add(Activation('relu')) 41 | actor.add(Dense(nb_actions)) 42 | actor.add(Activation('tanh')) 43 | print(actor.summary()) 44 | 45 | action_input = Input(shape=(nb_actions,), name='action_input') 46 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 47 | flattened_observation = Flatten()(observation_input) 48 | x = Dense(400)(flattened_observation) 49 | x = Activation('relu')(x) 50 | x = concatenate([x, action_input]) 51 | x = Dense(300)(x) 52 | x = Activation('relu')(x) 53 | x = Dense(1)(x) 54 | x = Activation('linear')(x) 55 | critic = Model(input=[action_input, observation_input], output=x) 56 | print(critic.summary()) 57 | 58 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 59 | # even the metrics! 60 | memory = SequentialMemory(limit=100000, window_length=1) 61 | random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1) 62 | agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, 63 | memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, 64 | random_process=random_process, gamma=.99, target_model_update=1e-3, 65 | processor=MujocoProcessor()) 66 | agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae']) 67 | 68 | # Okay, now it's time to learn something! We visualize the training here for show, but this 69 | # slows down training quite a lot. You can always safely abort the training prematurely using 70 | # Ctrl + C. 71 | agent.fit(env, nb_steps=1000000, visualize=False, verbose=1) 72 | 73 | # After training is done, we save the final weights. 74 | agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 75 | 76 | # Finally, evaluate our algorithm for 5 episodes. 
77 | agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200) 78 | -------------------------------------------------------------------------------- /tests/rl/test_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pytest 3 | import numpy as np 4 | from numpy.testing import assert_allclose 5 | 6 | from keras.models import Model, Sequential 7 | from keras.layers import Input, Dense, merge 8 | from keras.optimizers import SGD 9 | import keras.backend as K 10 | 11 | from rl.util import clone_optimizer, clone_model, huber_loss, WhiteningNormalizer 12 | 13 | 14 | def test_clone_sequential_model(): 15 | seq = Sequential() 16 | seq.add(Dense(8, input_shape=(3,))) 17 | seq.compile(optimizer='sgd', loss='mse') 18 | 19 | clone = clone_model(seq) 20 | clone.compile(optimizer='sgd', loss='mse') 21 | 22 | ins = np.random.random((4, 3)) 23 | y_pred_seq = seq.predict_on_batch(ins) 24 | y_pred_clone = clone.predict_on_batch(ins) 25 | assert y_pred_seq.shape == y_pred_clone.shape 26 | assert_allclose(y_pred_seq, y_pred_clone) 27 | 28 | 29 | def test_clone_graph_model(): 30 | in1 = Input(shape=(2,)) 31 | in2 = Input(shape=(3,)) 32 | x = Dense(8)(merge([in1, in2], mode='concat')) 33 | graph = Model([in1, in2], x) 34 | graph.compile(optimizer='sgd', loss='mse') 35 | 36 | clone = clone_model(graph) 37 | clone.compile(optimizer='sgd', loss='mse') 38 | 39 | ins = [np.random.random((4, 2)), np.random.random((4, 3))] 40 | y_pred_graph = graph.predict_on_batch(ins) 41 | y_pred_clone = clone.predict_on_batch(ins) 42 | assert y_pred_graph.shape == y_pred_clone.shape 43 | assert_allclose(y_pred_graph, y_pred_clone) 44 | 45 | 46 | def test_clone_optimizer(): 47 | lr, momentum, clipnorm, clipvalue = np.random.random(size=4) 48 | optimizer = SGD(lr=lr, momentum=momentum, clipnorm=clipnorm, clipvalue=clipvalue) 49 | clone = clone_optimizer(optimizer) 50 | 51 | assert isinstance(clone, SGD) 52 | assert K.get_value(optimizer.lr) == K.get_value(clone.lr) 53 | assert K.get_value(optimizer.momentum) == K.get_value(clone.momentum) 54 | assert optimizer.clipnorm == clone.clipnorm 55 | assert optimizer.clipvalue == clone.clipvalue 56 | 57 | 58 | def test_clone_optimizer_from_string(): 59 | clone = clone_optimizer('sgd') 60 | assert isinstance(clone, SGD) 61 | 62 | 63 | def test_huber_loss(): 64 | a = np.array([1., 1.5, 2., 4.]) 65 | b = np.array([1.5, 1., 4., 2.]) 66 | assert_allclose(K.eval(huber_loss(a, b, 1.)), np.array([.125, .125, 1.5, 1.5])) 67 | assert_allclose(K.eval(huber_loss(a, b, 3.)), np.array([.125, .125, 2., 2.])) 68 | assert_allclose(K.eval(huber_loss(a, b, np.inf)), np.array([.125, .125, 2., 2.])) 69 | 70 | 71 | def test_whitening_normalizer(): 72 | x = np.random.normal(loc=.2, scale=2., size=(1000, 5)) 73 | normalizer = WhiteningNormalizer(shape=(5,)) 74 | normalizer.update(x[:500]) 75 | normalizer.update(x[500:]) 76 | 77 | assert_allclose(normalizer.mean, np.mean(x, axis=0)) 78 | assert_allclose(normalizer.std, np.std(x, axis=0)) 79 | 80 | x_norm = normalizer.normalize(x) 81 | assert_allclose(np.mean(x_norm, axis=0), np.zeros(5, dtype=normalizer.dtype), atol=1e-5) 82 | assert_allclose(np.std(x_norm, axis=0), np.ones(5, dtype=normalizer.dtype), atol=1e-5) 83 | 84 | x_denorm = normalizer.denormalize(x_norm) 85 | assert_allclose(x_denorm, x) 86 | 87 | 88 | if __name__ == '__main__': 89 | pytest.main([__file__]) 90 | -------------------------------------------------------------------------------- 
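The Huber-loss expectations asserted in `test_huber_loss` above follow the standard piecewise definition. A small NumPy sketch of that convention (a reference formula only, not the library implementation, which lives in `rl/util.py`):

```python
import numpy as np


def huber_loss_reference(y_true, y_pred, clip_value):
    # Quadratic for small errors, linear beyond clip_value.
    d = np.abs(y_true - y_pred)
    quadratic = 0.5 * d ** 2
    linear = clip_value * d - 0.5 * clip_value ** 2
    return np.where(d <= clip_value, quadratic, linear)


# Reproduces the expected values from the test above:
# huber_loss_reference(np.array([1., 1.5, 2., 4.]), np.array([1.5, 1., 4., 2.]), 1.)
# -> array([0.125, 0.125, 1.5, 1.5])
```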
/examples/naf_pendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten, Input 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents import NAFAgent 9 | from rl.memory import SequentialMemory 10 | from rl.random import OrnsteinUhlenbeckProcess 11 | from rl.core import Processor 12 | from rl.keras_future import concatenate, Model 13 | 14 | class PendulumProcessor(Processor): 15 | def process_reward(self, reward): 16 | # The magnitude of the reward can be important. Since each step yields a relatively 17 | # high reward, we reduce the magnitude by two orders. 18 | return reward / 100. 19 | 20 | 21 | ENV_NAME = 'Pendulum-v0' 22 | gym.undo_logger_setup() 23 | 24 | 25 | # Get the environment and extract the number of actions. 26 | env = gym.make(ENV_NAME) 27 | np.random.seed(123) 28 | env.seed(123) 29 | assert len(env.action_space.shape) == 1 30 | nb_actions = env.action_space.shape[0] 31 | 32 | # Build all necessary models: V, mu, and L networks. 33 | V_model = Sequential() 34 | V_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 35 | V_model.add(Dense(16)) 36 | V_model.add(Activation('relu')) 37 | V_model.add(Dense(16)) 38 | V_model.add(Activation('relu')) 39 | V_model.add(Dense(16)) 40 | V_model.add(Activation('relu')) 41 | V_model.add(Dense(1)) 42 | V_model.add(Activation('linear')) 43 | print(V_model.summary()) 44 | 45 | mu_model = Sequential() 46 | mu_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 47 | mu_model.add(Dense(16)) 48 | mu_model.add(Activation('relu')) 49 | mu_model.add(Dense(16)) 50 | mu_model.add(Activation('relu')) 51 | mu_model.add(Dense(16)) 52 | mu_model.add(Activation('relu')) 53 | mu_model.add(Dense(nb_actions)) 54 | mu_model.add(Activation('linear')) 55 | print(mu_model.summary()) 56 | 57 | action_input = Input(shape=(nb_actions,), name='action_input') 58 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 59 | x = concatenate([action_input, Flatten()(observation_input)]) 60 | x = Dense(32)(x) 61 | x = Activation('relu')(x) 62 | x = Dense(32)(x) 63 | x = Activation('relu')(x) 64 | x = Dense(32)(x) 65 | x = Activation('relu')(x) 66 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 67 | x = Activation('linear')(x) 68 | L_model = Model(input=[action_input, observation_input], output=x) 69 | print(L_model.summary()) 70 | 71 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 72 | # even the metrics! 73 | processor = PendulumProcessor() 74 | memory = SequentialMemory(limit=100000, window_length=1) 75 | random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions) 76 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 77 | memory=memory, nb_steps_warmup=100, random_process=random_process, 78 | gamma=.99, target_model_update=1e-3, processor=processor) 79 | agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) 80 | 81 | # Okay, now it's time to learn something! We visualize the training here for show, but this 82 | # slows down training quite a lot. You can always safely abort the training prematurely using 83 | # Ctrl + C. 84 | agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200) 85 | 86 | # After training is done, we save the final weights. 
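# (If needed, they can be restored later with `agent.load_weights(...)` so the
# trained policy can be evaluated without retraining.)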
87 | agent.save_weights('cdqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 88 | 89 | # Finally, evaluate our algorithm for 5 episodes. 90 | agent.test(env, nb_episodes=10, visualize=True, nb_max_episode_steps=200) 91 | -------------------------------------------------------------------------------- /tests/integration/test_continuous.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import gym 5 | 6 | from keras.models import Sequential 7 | from keras.layers import Dense, Activation, Flatten, Input 8 | from keras.optimizers import Adam 9 | 10 | from rl.agents import NAFAgent, DDPGAgent 11 | from rl.random import OrnsteinUhlenbeckProcess 12 | from rl.memory import SequentialMemory 13 | from rl.keras_future import Model, concatenate 14 | 15 | 16 | def test_cdqn(): 17 | # TODO: replace this with a simpler environment where we can actually test if it finds a solution 18 | env = gym.make('Pendulum-v0') 19 | np.random.seed(123) 20 | env.seed(123) 21 | random.seed(123) 22 | nb_actions = env.action_space.shape[0] 23 | 24 | V_model = Sequential() 25 | V_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 26 | V_model.add(Dense(16)) 27 | V_model.add(Activation('relu')) 28 | V_model.add(Dense(1)) 29 | 30 | mu_model = Sequential() 31 | mu_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 32 | mu_model.add(Dense(16)) 33 | mu_model.add(Activation('relu')) 34 | mu_model.add(Dense(nb_actions)) 35 | 36 | action_input = Input(shape=(nb_actions,), name='action_input') 37 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 38 | x = concatenate([action_input, Flatten()(observation_input)]) 39 | x = Dense(16)(x) 40 | x = Activation('relu')(x) 41 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 42 | L_model = Model(input=[action_input, observation_input], output=x) 43 | 44 | memory = SequentialMemory(limit=1000, window_length=1) 45 | random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions) 46 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 47 | memory=memory, nb_steps_warmup=50, random_process=random_process, 48 | gamma=.99, target_model_update=1e-3) 49 | agent.compile(Adam(lr=1e-3)) 50 | 51 | agent.fit(env, nb_steps=400, visualize=False, verbose=0, nb_max_episode_steps=100) 52 | h = agent.test(env, nb_episodes=2, visualize=False, nb_max_episode_steps=100) 53 | # TODO: evaluate history 54 | 55 | 56 | def test_ddpg(): 57 | # TODO: replace this with a simpler environment where we can actually test if it finds a solution 58 | env = gym.make('Pendulum-v0') 59 | np.random.seed(123) 60 | env.seed(123) 61 | random.seed(123) 62 | nb_actions = env.action_space.shape[0] 63 | 64 | actor = Sequential() 65 | actor.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 66 | actor.add(Dense(16)) 67 | actor.add(Activation('relu')) 68 | actor.add(Dense(nb_actions)) 69 | actor.add(Activation('linear')) 70 | 71 | action_input = Input(shape=(nb_actions,), name='action_input') 72 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 73 | flattened_observation = Flatten()(observation_input) 74 | x = concatenate([action_input, flattened_observation]) 75 | x = Dense(16)(x) 76 | x = Activation('relu')(x) 77 | x = Dense(1)(x) 78 | x = Activation('linear')(x) 79 | critic = Model(input=[action_input, observation_input], output=x) 80 | 81 | 
memory = SequentialMemory(limit=1000, window_length=1) 82 | random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3) 83 | agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, 84 | memory=memory, nb_steps_warmup_critic=50, nb_steps_warmup_actor=50, 85 | random_process=random_process, gamma=.99, target_model_update=1e-3) 86 | agent.compile([Adam(lr=1e-3), Adam(lr=1e-3)]) 87 | 88 | agent.fit(env, nb_steps=400, visualize=False, verbose=0, nb_max_episode_steps=100) 89 | h = agent.test(env, nb_episodes=2, visualize=False, nb_max_episode_steps=100) 90 | # TODO: evaluate history 91 | -------------------------------------------------------------------------------- /rl/policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | 4 | from rl.util import * 5 | 6 | 7 | class Policy(object): 8 | def _set_agent(self, agent): 9 | self.agent = agent 10 | 11 | @property 12 | def metrics_names(self): 13 | return [] 14 | 15 | @property 16 | def metrics(self): 17 | return [] 18 | 19 | def select_action(self, **kwargs): 20 | raise NotImplementedError() 21 | 22 | def get_config(self): 23 | return {} 24 | 25 | 26 | class LinearAnnealedPolicy(Policy): 27 | def __init__(self, inner_policy, attr, value_max, value_min, value_test, nb_steps): 28 | if not hasattr(inner_policy, attr): 29 | raise ValueError('Policy "{}" does not have attribute "{}".'.format(attr)) 30 | 31 | super(LinearAnnealedPolicy, self).__init__() 32 | 33 | self.inner_policy = inner_policy 34 | self.attr = attr 35 | self.value_max = value_max 36 | self.value_min = value_min 37 | self.value_test = value_test 38 | self.nb_steps = nb_steps 39 | 40 | def get_current_value(self): 41 | if self.agent.training: 42 | # Linear annealed: f(x) = ax + b. 
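# The annealed value starts at value_max when agent.step == 0, decreases linearly,
# and is clamped at value_min once nb_steps have elapsed. For example, with
# value_max=1.0, value_min=0.1 and nb_steps=1000000, the value at step 500000 is 0.55.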
43 | a = -float(self.value_max - self.value_min) / float(self.nb_steps) 44 | b = float(self.value_max) 45 | value = max(self.value_min, a * float(self.agent.step) + b) 46 | else: 47 | value = self.value_test 48 | return value 49 | 50 | def select_action(self, **kwargs): 51 | setattr(self.inner_policy, self.attr, self.get_current_value()) 52 | return self.inner_policy.select_action(**kwargs) 53 | 54 | @property 55 | def metrics_names(self): 56 | return ['mean_{}'.format(self.attr)] 57 | 58 | @property 59 | def metrics(self): 60 | return [getattr(self.inner_policy, self.attr)] 61 | 62 | def get_config(self): 63 | config = super(LinearAnnealedPolicy, self).get_config() 64 | config['attr'] = self.attr 65 | config['value_max'] = self.value_max 66 | config['value_min'] = self.value_min 67 | config['value_test'] = self.value_test 68 | config['nb_steps'] = self.nb_steps 69 | config['inner_policy'] = get_object_config(self.inner_policy) 70 | return config 71 | 72 | 73 | class EpsGreedyQPolicy(Policy): 74 | def __init__(self, eps=.1): 75 | super(EpsGreedyQPolicy, self).__init__() 76 | self.eps = eps 77 | 78 | def select_action(self, q_values): 79 | assert q_values.ndim == 1 80 | nb_actions = q_values.shape[0] 81 | 82 | if np.random.uniform() < self.eps: 83 | action = np.random.random_integers(0, nb_actions-1) 84 | else: 85 | action = np.argmax(q_values) 86 | return action 87 | 88 | def get_config(self): 89 | config = super(EpsGreedyQPolicy, self).get_config() 90 | config['eps'] = self.eps 91 | return config 92 | 93 | 94 | class GreedyQPolicy(Policy): 95 | def select_action(self, q_values): 96 | assert q_values.ndim == 1 97 | action = np.argmax(q_values) 98 | return action 99 | 100 | 101 | class BoltzmannQPolicy(Policy): 102 | def __init__(self, tau=1., clip=(-500., 500.)): 103 | super(BoltzmannQPolicy, self).__init__() 104 | self.tau = tau 105 | self.clip = clip 106 | 107 | def select_action(self, q_values): 108 | assert q_values.ndim == 1 109 | q_values = q_values.astype('float64') 110 | nb_actions = q_values.shape[0] 111 | 112 | exp_values = np.exp(np.clip(q_values / self.tau, self.clip[0], self.clip[1])) 113 | probs = exp_values / np.sum(exp_values) 114 | action = np.random.choice(range(nb_actions), p=probs) 115 | return action 116 | 117 | def get_config(self): 118 | config = super(BoltzmannQPolicy, self).get_config() 119 | config['tau'] = self.tau 120 | config['clip'] = self.clip 121 | return config 122 | -------------------------------------------------------------------------------- /rl/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from keras.models import model_from_config, Sequential, Model, model_from_config 4 | import keras.optimizers as optimizers 5 | import keras.backend as K 6 | 7 | 8 | def clone_model(model, custom_objects={}): 9 | # Requires Keras 1.0.7 since get_config has breaking changes. 10 | config = { 11 | 'class_name': model.__class__.__name__, 12 | 'config': model.get_config(), 13 | } 14 | clone = model_from_config(config, custom_objects=custom_objects) 15 | clone.set_weights(model.get_weights()) 16 | return clone 17 | 18 | 19 | def clone_optimizer(optimizer): 20 | if type(optimizer) is str: 21 | return optimizers.get(optimizer) 22 | # Requires Keras 1.0.7 since get_config has breaking changes. 
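# Rebuilding the optimizer from its serialized config yields an independent copy that
# shares no internal state (e.g. accumulated gradients or moments) with the original.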
23 | params = dict([(k, v) for k, v in optimizer.get_config().items()]) 24 | config = { 25 | 'class_name': optimizer.__class__.__name__, 26 | 'config': params, 27 | } 28 | if hasattr(optimizers, 'optimizer_from_config'): 29 | # COMPATIBILITY: Keras < 2.0 30 | clone = optimizers.optimizer_from_config(config) 31 | else: 32 | clone = optimizers.deserialize(config) 33 | return clone 34 | 35 | 36 | def get_soft_target_model_updates(target, source, tau): 37 | target_weights = target.trainable_weights + sum([l.non_trainable_weights for l in target.layers], []) 38 | source_weights = source.trainable_weights + sum([l.non_trainable_weights for l in source.layers], []) 39 | assert len(target_weights) == len(source_weights) 40 | 41 | # Create updates. 42 | updates = [] 43 | for tw, sw in zip(target_weights, source_weights): 44 | updates.append((tw, tau * sw + (1. - tau) * tw)) 45 | return updates 46 | 47 | 48 | def get_object_config(o): 49 | if o is None: 50 | return None 51 | 52 | config = { 53 | 'class_name': o.__class__.__name__, 54 | 'config': o.get_config() 55 | } 56 | return config 57 | 58 | 59 | def huber_loss(y_true, y_pred, clip_value): 60 | # Huber loss, see https://en.wikipedia.org/wiki/Huber_loss and 61 | # https://medium.com/@karpathy/yes-you-should-understand-backprop-e2f06eab496b 62 | # for details. 63 | assert clip_value > 0. 64 | 65 | x = y_true - y_pred 66 | if np.isinf(clip_value): 67 | # Spacial case for infinity since Tensorflow does have problems 68 | # if we compare `K.abs(x) < np.inf`. 69 | return .5 * K.square(x) 70 | 71 | condition = K.abs(x) < clip_value 72 | squared_loss = .5 * K.square(x) 73 | linear_loss = clip_value * (K.abs(x) - .5 * clip_value) 74 | if K.backend() == 'tensorflow': 75 | import tensorflow as tf 76 | if hasattr(tf, 'select'): 77 | return tf.select(condition, squared_loss, linear_loss) # condition, true, false 78 | else: 79 | return tf.where(condition, squared_loss, linear_loss) # condition, true, false 80 | elif K.backend() == 'theano': 81 | from theano import tensor as T 82 | return T.switch(condition, squared_loss, linear_loss) 83 | else: 84 | raise RuntimeError('Unknown backend "{}".'.format(K.backend())) 85 | 86 | 87 | class AdditionalUpdatesOptimizer(optimizers.Optimizer): 88 | def __init__(self, optimizer, additional_updates): 89 | super(AdditionalUpdatesOptimizer, self).__init__() 90 | self.optimizer = optimizer 91 | self.additional_updates = additional_updates 92 | 93 | def get_updates(self, params, constraints, loss): 94 | updates = self.optimizer.get_updates(params, constraints, loss) 95 | updates += self.additional_updates 96 | self.updates = updates 97 | return self.updates 98 | 99 | def get_config(self): 100 | return self.optimizer.get_config() 101 | 102 | 103 | # Based on https://github.com/openai/baselines/blob/master/baselines/common/mpi_running_mean_std.py 104 | class WhiteningNormalizer(object): 105 | def __init__(self, shape, eps=1e-2, dtype=np.float64): 106 | self.eps = eps 107 | self.shape = shape 108 | self.dtype = dtype 109 | 110 | self._sum = np.zeros(shape, dtype=dtype) 111 | self._sumsq = np.zeros(shape, dtype=dtype) 112 | self._count = 0 113 | 114 | self.mean = np.zeros(shape, dtype=dtype) 115 | self.std = np.ones(shape, dtype=dtype) 116 | 117 | def normalize(self, x): 118 | return (x - self.mean) / self.std 119 | 120 | def denormalize(self, x): 121 | return self.std * x + self.mean 122 | 123 | def update(self, x): 124 | if x.ndim == len(self.shape): 125 | x = x.reshape(-1, *self.shape) 126 | assert x.shape[1:] == self.shape 127 
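# Keep running sums of x and x**2 so mean and std can be refreshed incrementally:
# mean = sum(x) / n and std = sqrt(max(eps**2, sum(x**2) / n - mean**2)), i.e. the
# population standard deviation floored at `eps` for numerical stability.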
| 128 | self._count += x.shape[0] 129 | self._sum += np.sum(x, axis=0) 130 | self._sumsq += np.sum(np.square(x), axis=0) 131 | 132 | self.mean = self._sum / float(self._count) 133 | self.std = np.sqrt(np.maximum(np.square(self.eps), self._sumsq / float(self._count) - np.square(self.mean))) 134 | -------------------------------------------------------------------------------- /tests/integration/test_discrete.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from numpy.testing import assert_allclose 5 | from gym.envs.debugging.two_round_deterministic_reward import TwoRoundDeterministicRewardEnv 6 | 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Activation, Flatten 9 | from keras.optimizers import Adam 10 | from rl.agents import DQNAgent, CEMAgent, SARSAAgent 11 | from rl.policy import EpsGreedyQPolicy 12 | from rl.memory import SequentialMemory, EpisodeParameterMemory 13 | 14 | 15 | def test_dqn(): 16 | env = TwoRoundDeterministicRewardEnv() 17 | np.random.seed(123) 18 | env.seed(123) 19 | random.seed(123) 20 | nb_actions = env.action_space.n 21 | 22 | # Next, we build a very simple model. 23 | model = Sequential() 24 | model.add(Dense(16, input_shape=(1,))) 25 | model.add(Activation('relu')) 26 | model.add(Dense(nb_actions)) 27 | model.add(Activation('linear')) 28 | 29 | memory = SequentialMemory(limit=1000, window_length=1) 30 | policy = EpsGreedyQPolicy(eps=.1) 31 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50, 32 | target_model_update=1e-1, policy=policy, enable_double_dqn=False) 33 | dqn.compile(Adam(lr=1e-3)) 34 | 35 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=0) 36 | policy.eps = 0. 37 | h = dqn.test(env, nb_episodes=20, visualize=False) 38 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 39 | 40 | 41 | def test_double_dqn(): 42 | env = TwoRoundDeterministicRewardEnv() 43 | np.random.seed(123) 44 | env.seed(123) 45 | random.seed(123) 46 | nb_actions = env.action_space.n 47 | 48 | # Next, we build a very simple model. 49 | model = Sequential() 50 | model.add(Dense(16, input_shape=(1,))) 51 | model.add(Activation('relu')) 52 | model.add(Dense(nb_actions)) 53 | model.add(Activation('linear')) 54 | 55 | memory = SequentialMemory(limit=1000, window_length=1) 56 | policy = EpsGreedyQPolicy(eps=.1) 57 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50, 58 | target_model_update=1e-1, policy=policy, enable_double_dqn=True) 59 | dqn.compile(Adam(lr=1e-3)) 60 | 61 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=0) 62 | policy.eps = 0. 63 | h = dqn.test(env, nb_episodes=20, visualize=False) 64 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 65 | 66 | 67 | def test_cem(): 68 | env = TwoRoundDeterministicRewardEnv() 69 | np.random.seed(123) 70 | env.seed(123) 71 | random.seed(123) 72 | nb_actions = env.action_space.n 73 | 74 | # Next, we build a very simple model. 
75 | model = Sequential() 76 | model.add(Dense(16, input_shape=(1,))) 77 | model.add(Activation('relu')) 78 | model.add(Dense(nb_actions)) 79 | model.add(Activation('linear')) 80 | 81 | memory = EpisodeParameterMemory(limit=1000, window_length=1) 82 | dqn = CEMAgent(model=model, nb_actions=nb_actions, memory=memory) 83 | dqn.compile() 84 | 85 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=1) 86 | h = dqn.test(env, nb_episodes=20, visualize=False) 87 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 88 | 89 | 90 | def test_duel_dqn(): 91 | env = TwoRoundDeterministicRewardEnv() 92 | np.random.seed(123) 93 | env.seed(123) 94 | random.seed(123) 95 | nb_actions = env.action_space.n 96 | 97 | # Next, we build a very simple model. 98 | model = Sequential() 99 | model.add(Dense(16, input_shape=(1,))) 100 | model.add(Activation('relu')) 101 | model.add(Dense(nb_actions, activation='linear')) 102 | 103 | memory = SequentialMemory(limit=1000, window_length=1) 104 | policy = EpsGreedyQPolicy(eps=.1) 105 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50, 106 | target_model_update=1e-1, policy=policy, enable_double_dqn=False, enable_dueling_network=True) 107 | dqn.compile(Adam(lr=1e-3)) 108 | 109 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=0) 110 | policy.eps = 0. 111 | h = dqn.test(env, nb_episodes=20, visualize=False) 112 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 113 | 114 | 115 | def test_sarsa(): 116 | env = TwoRoundDeterministicRewardEnv() 117 | np.random.seed(123) 118 | env.seed(123) 119 | random.seed(123) 120 | nb_actions = env.action_space.n 121 | 122 | # Next, we build a very simple model. 123 | model = Sequential() 124 | model.add(Dense(16, input_shape=(1,))) 125 | model.add(Activation('relu')) 126 | model.add(Dense(nb_actions, activation='linear')) 127 | 128 | policy = EpsGreedyQPolicy(eps=.1) 129 | sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=50, policy=policy) 130 | sarsa.compile(Adam(lr=1e-3)) 131 | 132 | sarsa.fit(env, nb_steps=20000, visualize=False, verbose=0) 133 | policy.eps = 0. 134 | h = sarsa.test(env, nb_episodes=20, visualize=False) 135 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 
136 | -------------------------------------------------------------------------------- /examples/dqn_atari.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import argparse 3 | 4 | from PIL import Image 5 | import numpy as np 6 | import gym 7 | 8 | from keras.models import Sequential 9 | from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute 10 | from keras.optimizers import Adam 11 | import keras.backend as K 12 | 13 | from rl.agents.dqn import DQNAgent 14 | from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy 15 | from rl.memory import SequentialMemory 16 | from rl.core import Processor 17 | from rl.callbacks import FileLogger, ModelIntervalCheckpoint 18 | 19 | 20 | INPUT_SHAPE = (84, 84) 21 | WINDOW_LENGTH = 4 22 | 23 | 24 | class AtariProcessor(Processor): 25 | def process_observation(self, observation): 26 | assert observation.ndim == 3 # (height, width, channel) 27 | img = Image.fromarray(observation) 28 | img = img.resize(INPUT_SHAPE).convert('L') # resize and convert to grayscale 29 | processed_observation = np.array(img) 30 | assert processed_observation.shape == INPUT_SHAPE 31 | return processed_observation.astype('uint8') # saves storage in experience memory 32 | 33 | def process_state_batch(self, batch): 34 | # We could perform this processing step in `process_observation`. In this case, however, 35 | # we would need to store a `float32` array instead, which is 4x more memory intensive than 36 | # an `uint8` array. This matters if we store 1M observations. 37 | processed_batch = batch.astype('float32') / 255. 38 | return processed_batch 39 | 40 | def process_reward(self, reward): 41 | return np.clip(reward, -1., 1.) 42 | 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('--mode', choices=['train', 'test'], default='train') 45 | parser.add_argument('--env-name', type=str, default='BreakoutDeterministic-v4') 46 | parser.add_argument('--weights', type=str, default=None) 47 | args = parser.parse_args() 48 | 49 | # Get the environment and extract the number of actions. 50 | env = gym.make(args.env_name) 51 | np.random.seed(123) 52 | env.seed(123) 53 | nb_actions = env.action_space.n 54 | 55 | # Next, we build our model. We use the same model that was described by Mnih et al. (2015). 56 | input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE 57 | model = Sequential() 58 | if K.image_dim_ordering() == 'tf': 59 | # (width, height, channels) 60 | model.add(Permute((2, 3, 1), input_shape=input_shape)) 61 | elif K.image_dim_ordering() == 'th': 62 | # (channels, width, height) 63 | model.add(Permute((1, 2, 3), input_shape=input_shape)) 64 | else: 65 | raise RuntimeError('Unknown image_dim_ordering.') 66 | model.add(Convolution2D(32, 8, 8, subsample=(4, 4))) 67 | model.add(Activation('relu')) 68 | model.add(Convolution2D(64, 4, 4, subsample=(2, 2))) 69 | model.add(Activation('relu')) 70 | model.add(Convolution2D(64, 3, 3, subsample=(1, 1))) 71 | model.add(Activation('relu')) 72 | model.add(Flatten()) 73 | model.add(Dense(512)) 74 | model.add(Activation('relu')) 75 | model.add(Dense(nb_actions)) 76 | model.add(Activation('linear')) 77 | print(model.summary()) 78 | 79 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 80 | # even the metrics! 81 | memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH) 82 | processor = AtariProcessor() 83 | 84 | # Select a policy. 
We use eps-greedy action selection, which means that a random action is selected 85 | # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that 86 | # the agent initially explores the environment (high eps) and then gradually sticks to what it knows 87 | # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05 88 | # so that the agent still performs some random actions. This ensures that the agent cannot get stuck. 89 | policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, 90 | nb_steps=1000000) 91 | 92 | # The trade-off between exploration and exploitation is difficult and an on-going research topic. 93 | # If you want, you can experiment with the parameters or use a different policy. Another popular one 94 | # is Boltzmann-style exploration: 95 | # policy = BoltzmannQPolicy(tau=1.) 96 | # Feel free to give it a try! 97 | 98 | dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, 99 | processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000, 100 | train_interval=4, delta_clip=1.) 101 | dqn.compile(Adam(lr=.00025), metrics=['mae']) 102 | 103 | if args.mode == 'train': 104 | # Okay, now it's time to learn something! We capture the interrupt exception so that training 105 | # can be prematurely aborted. Notice that you can the built-in Keras callbacks! 106 | weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) 107 | checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f' 108 | log_filename = 'dqn_{}_log.json'.format(args.env_name) 109 | callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)] 110 | callbacks += [FileLogger(log_filename, interval=100)] 111 | dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000) 112 | 113 | # After training is done, we save the final weights one more time. 114 | dqn.save_weights(weights_filename, overwrite=True) 115 | 116 | # Finally, evaluate our algorithm for 10 episodes. 117 | dqn.test(env, nb_episodes=10, visualize=False) 118 | elif args.mode == 'test': 119 | weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) 120 | if args.weights: 121 | weights_filename = args.weights 122 | dqn.load_weights(weights_filename) 123 | dqn.test(env, nb_episodes=10, visualize=True) 124 | -------------------------------------------------------------------------------- /docs/sources/agents/overview.md: -------------------------------------------------------------------------------- 1 | ## Available Agents 2 | 3 | | Name | Implementation | Observation Space | Action Space | 4 | | ---------------------- |------------------------| -------------------| ---------------| 5 | | [DQN](/agents/dqn) | `rl.agents.DQNAgent` | discrete or continuous | discrete | 6 | | [DDPG](/agents/ddpg) | `rl.agents.DDPGAgent` | discrete or continuous | continuous | 7 | | [NAF](/agents/naf) | `rl.agents.NAFAgent` | discrete or continuous | continuous | 8 | | [CEM](/agents/cem) | `rl.agents.CEMAgent` | discrete or continuous | discrete | 9 | | [SARSA](/agents/sarsa) | `rl.agents.SARSAAgent` | discrete or continuous | discrete | 10 | 11 | --- 12 | 13 | ## Common API 14 | 15 | All agents share a common API. This allows you to easily switch between different agents. 16 | That being said, keep in mind that some agents make assumptions regarding the action space, i.e. assume discrete 17 | or continuous actions. 
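
To make the shared workflow concrete, here is a minimal sketch of the common `compile` -> `fit` -> `test` cycle. It assumes that `agent` is an already-constructed keras-rl agent (for example a `DQNAgent`) and that `env` is an OpenAI Gym environment; the step counts and file name are illustrative only:

```python
from keras.optimizers import Adam

# Build the training function (optimizer plus optional extra metrics).
agent.compile(Adam(lr=1e-3), metrics=['mae'])

# Interact with the environment and learn for a fixed number of steps.
agent.fit(env, nb_steps=50000, visualize=False, verbose=2)

# Persist the learned weights and evaluate the greedy policy.
agent.save_weights('agent_weights.h5f', overwrite=True)
agent.test(env, nb_episodes=5, visualize=True)
```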
18 | 19 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L44) 20 | 21 | ### fit 22 | 23 | 24 | ```python 25 | fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None) 26 | ``` 27 | 28 | 29 | Trains the agent on the given environment. 30 | 31 | __Arguments__ 32 | 33 | - __env:__ (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. 34 | - __nb_steps__ (integer): Number of training steps to be performed. 35 | - __action_repetition__ (integer): Number of times the agent repeats the same action without 36 | observing the environment again. Setting this to a value > 1 can be useful 37 | if a single action only has a very small effect on the environment. 38 | - __callbacks__ (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): 39 | List of callbacks to apply during training. See [callbacks](/callbacks) for details. 40 | - __verbose__ (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging 41 | - __visualize__ (boolean): If `True`, the environment is visualized during training. However, 42 | this is likely going to slow down training significantly and is thus intended to be 43 | a debugging instrument. 44 | - __nb_max_start_steps__ (integer): Number of maximum steps that the agent performs at the beginning 45 | of each episode using `start_step_policy`. Notice that this is an upper limit since 46 | the exact number of steps to be performed is sampled uniformly from [0, max_start_steps] 47 | at the beginning of each episode. 48 | - __start_step_policy__ (`lambda observation: action`): The policy 49 | to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. 50 | - __log_interval__ (integer): If `verbose` = 1, the number of steps that are considered to be an interval. 51 | - __nb_max_episode_steps__ (integer): Number of steps per episode that the agent performs before 52 | automatically resetting the environment. Set to `None` if each episode should run 53 | (potentially indefinitely) until the environment signals a terminal state. 54 | 55 | __Returns__ 56 | 57 | A `keras.callbacks.History` instance that recorded the entire training process. 58 | 59 | ---- 60 | 61 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L231) 62 | 63 | ### test 64 | 65 | 66 | ```python 67 | test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1) 68 | ``` 69 | 70 | 71 | Callback that is called before training begins." 72 | 73 | ---- 74 | 75 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L391) 76 | 77 | ### compile 78 | 79 | 80 | ```python 81 | compile(self, optimizer, metrics=[]) 82 | ``` 83 | 84 | 85 | Compiles an agent and the underlaying models to be used for training and testing. 86 | 87 | __Arguments__ 88 | 89 | - __optimizer__ (`keras.optimizers.Optimizer` instance): The optimizer to be used during training. 90 | - __metrics__ (list of functions `lambda y_true, y_pred: metric`): The metrics to run during training. 
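
As a rough illustration (assuming a single-network agent such as `DQNAgent`; agents with several networks, e.g. `DDPGAgent`, expect a list of optimizers instead), a custom metric function can be passed alongside Keras' built-in ones:

```python
from keras.optimizers import Adam
import keras.backend as K

def max_abs_error(y_true, y_pred):
    # Illustrative custom metric: largest absolute error between targets and predictions.
    return K.max(K.abs(y_true - y_pred))

agent.compile(Adam(lr=1e-3), metrics=[max_abs_error, 'mae'])
```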
91 | 92 | ---- 93 | 94 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L39) 95 | 96 | ### get_config 97 | 98 | 99 | ```python 100 | get_config(self) 101 | ``` 102 | 103 | 104 | Configuration of the agent for serialization. 105 | 106 | ---- 107 | 108 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L364) 109 | 110 | ### reset_states 111 | 112 | 113 | ```python 114 | reset_states(self) 115 | ``` 116 | 117 | 118 | Resets all internally kept states after an episode is completed. 119 | 120 | ---- 121 | 122 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L400) 123 | 124 | ### load_weights 125 | 126 | 127 | ```python 128 | load_weights(self, filepath) 129 | ``` 130 | 131 | 132 | Loads the weights of an agent from an HDF5 file. 133 | 134 | __Arguments__ 135 | 136 | - __filepath__ (str): The path to the HDF5 file. 137 | 138 | ---- 139 | 140 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L408) 141 | 142 | ### save_weights 143 | 144 | 145 | ```python 146 | save_weights(self, filepath, overwrite=False) 147 | ``` 148 | 149 | 150 | Saves the weights of an agent as an HDF5 file. 151 | 152 | __Arguments__ 153 | 154 | - __filepath__ (str): The path to where the weights should be saved. 155 | - __overwrite__ (boolean): If `False` and `filepath` already exists, raises an error. 156 | 157 | -------------------------------------------------------------------------------- /tests/rl/test_core.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pytest 3 | import numpy as np 4 | from numpy.testing import assert_allclose 5 | 6 | from rl.memory import SequentialMemory 7 | from rl.core import Agent, Env, Processor 8 | 9 | 10 | class TestEnv(Env): 11 | def __init__(self): 12 | super(TestEnv, self).__init__() 13 | 14 | def step(self, action): 15 | self.state += 1 16 | done = self.state >= 6 17 | reward = float(self.state) / 10. 18 | return np.array(self.state), reward, done, {} 19 | 20 | def reset(self): 21 | self.state = 1 22 | return np.array(self.state) 23 | 24 | def seed(self, seed=None): 25 | pass 26 | 27 | def configure(self, *args, **kwargs): 28 | pass 29 | 30 | 31 | class TestAgent(Agent): 32 | def __init__(self, memory, **kwargs): 33 | super(TestAgent, self).__init__(**kwargs) 34 | self.memory = memory 35 | 36 | def forward(self, observation): 37 | action = observation 38 | self.recent_action = action 39 | self.recent_observation = observation 40 | return action 41 | 42 | def backward(self, reward, terminal): 43 | metrics = [np.nan for _ in self.metrics_names] 44 | self.memory.append(self.recent_observation, self.recent_action, reward, terminal) 45 | return metrics 46 | 47 | def compile(self): 48 | self.compiled = True 49 | 50 | 51 | def test_fit_observations(): 52 | memory = SequentialMemory(100, window_length=2, ignore_episode_boundaries=False) 53 | agent = TestAgent(memory) 54 | env = TestEnv() 55 | agent.compile() 56 | agent.fit(env, 20, verbose=0) 57 | 58 | # Inspect memory to see if observations are correct. 
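# Because the memory was created with window_length=2, every sampled state stacks the
# two most recent observations, so state0/state1 below are consecutive pairs
# (zero-padded at the start of an episode).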
59 | experiencies = memory.sample(batch_size=8, batch_idxs=range(8)) 60 | 61 | assert experiencies[0].reward == .2 62 | assert experiencies[0].action == 1 63 | assert_allclose(experiencies[0].state0, np.array([0, 1])) 64 | assert_allclose(experiencies[0].state1, np.array([1, 2])) 65 | assert experiencies[0].terminal1 is False 66 | 67 | assert experiencies[1].reward == .3 68 | assert experiencies[1].action == 2 69 | assert_allclose(experiencies[1].state0, np.array([1, 2])) 70 | assert_allclose(experiencies[1].state1, np.array([2, 3])) 71 | assert experiencies[1].terminal1 is False 72 | 73 | assert experiencies[2].reward == .4 74 | assert experiencies[2].action == 3 75 | assert_allclose(experiencies[2].state0, np.array([2, 3])) 76 | assert_allclose(experiencies[2].state1, np.array([3, 4])) 77 | assert experiencies[2].terminal1 is False 78 | 79 | assert experiencies[3].reward == .5 80 | assert experiencies[3].action == 4 81 | assert_allclose(experiencies[3].state0, np.array([3, 4])) 82 | assert_allclose(experiencies[3].state1, np.array([4, 5])) 83 | assert experiencies[3].terminal1 is False 84 | 85 | assert experiencies[4].reward == .6 86 | assert experiencies[4].action == 5 87 | assert_allclose(experiencies[4].state0, np.array([4, 5])) 88 | assert_allclose(experiencies[4].state1, np.array([5, 6])) 89 | assert experiencies[4].terminal1 is True 90 | 91 | # Experience 5 has been re-sampled since since state0 would be terminal in which case we 92 | # cannot really have a meaningful transition because the environment gets reset. We thus 93 | # just ensure that state0 is not terminal. 94 | assert not np.all(experiencies[5].state0 == np.array([5, 6])) 95 | 96 | assert experiencies[6].reward == .2 97 | assert experiencies[6].action == 1 98 | assert_allclose(experiencies[6].state0, np.array([0, 1])) 99 | assert_allclose(experiencies[6].state1, np.array([1, 2])) 100 | assert experiencies[6].terminal1 is False 101 | 102 | assert experiencies[7].reward == .3 103 | assert experiencies[7].action == 2 104 | assert_allclose(experiencies[7].state0, np.array([1, 2])) 105 | assert_allclose(experiencies[7].state1, np.array([2, 3])) 106 | assert experiencies[7].terminal1 is False 107 | 108 | 109 | def test_copy_observations(): 110 | methods = [ 111 | 'fit', 112 | 'test', 113 | ] 114 | 115 | for method in methods: 116 | original_observations = [] 117 | 118 | class LocalEnv(Env): 119 | def __init__(self): 120 | super(LocalEnv, self).__init__() 121 | 122 | def step(self, action): 123 | self.state += 1 124 | done = self.state >= 6 125 | reward = float(self.state) / 10. 126 | obs = np.array(self.state) 127 | original_observations.append(obs) 128 | return obs, reward, done, {} 129 | 130 | def reset(self): 131 | self.state = 1 132 | return np.array(self.state) 133 | 134 | def seed(self, seed=None): 135 | pass 136 | 137 | def configure(self, *args, **kwargs): 138 | pass 139 | 140 | # Slight abuse of the processor for test purposes. 
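# Overriding Processor.process_step lets us record every observation the agent receives,
# so we can later check that they equal (but are copies of, not the same objects as)
# the observations the environment produced.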
141 | observations = [] 142 | 143 | class LocalProcessor(Processor): 144 | def process_step(self, observation, reward, done, info): 145 | observations.append(observation) 146 | return observation, reward, done, info 147 | 148 | processor = LocalProcessor() 149 | memory = SequentialMemory(100, window_length=1) 150 | agent = TestAgent(memory, processor=processor) 151 | env = LocalEnv() 152 | agent.compile() 153 | getattr(agent, method)(env, 20, verbose=0, visualize=False) 154 | 155 | assert len(observations) == len(original_observations) 156 | assert_allclose(np.array(observations), np.array(original_observations)) 157 | assert np.all([o is not o_ for o, o_ in zip(original_observations, observations)]) 158 | 159 | 160 | if __name__ == '__main__': 161 | pytest.main([__file__]) 162 | -------------------------------------------------------------------------------- /rl/agents/cem.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import deque 3 | from copy import deepcopy 4 | 5 | import numpy as np 6 | import keras.backend as K 7 | from keras.models import Model 8 | 9 | from rl.core import Agent 10 | from rl.util import * 11 | 12 | class CEMAgent(Agent): 13 | """Write me 14 | """ 15 | def __init__(self, model, nb_actions, memory, batch_size=50, nb_steps_warmup=1000, 16 | train_interval=50, elite_frac=0.05, memory_interval=1, theta_init=None, 17 | noise_decay_const=0.0, noise_ampl=0.0, **kwargs): 18 | super(CEMAgent, self).__init__(**kwargs) 19 | 20 | # Parameters. 21 | self.nb_actions = nb_actions 22 | self.batch_size = batch_size 23 | self.elite_frac = elite_frac 24 | self.num_best = int(self.batch_size * self.elite_frac) 25 | self.nb_steps_warmup = nb_steps_warmup 26 | self.train_interval = train_interval 27 | self.memory_interval = memory_interval 28 | 29 | # if using noisy CEM, the minimum standard deviation will be ampl * exp (- decay_const * step ) 30 | self.noise_decay_const = noise_decay_const 31 | self.noise_ampl = noise_ampl 32 | 33 | # default initial mean & cov, override this by passing an theta_init argument 34 | self.init_mean = 0.0 35 | self.init_stdev = 1.0 36 | 37 | # Related objects. 38 | self.memory = memory 39 | self.model = model 40 | self.shapes = [w.shape for w in model.get_weights()] 41 | self.sizes = [w.size for w in model.get_weights()] 42 | self.num_weights = sum(self.sizes) 43 | 44 | # store the best result seen during training, as a tuple (reward, flat_weights) 45 | self.best_seen = (-np.inf, np.zeros(self.num_weights)) 46 | 47 | self.theta = np.zeros(self.num_weights*2) 48 | self.update_theta(theta_init) 49 | 50 | # State. 
51 | self.episode = 0 52 | self.compiled = False 53 | self.reset_states() 54 | 55 | def compile(self): 56 | self.model.compile(optimizer='sgd', loss='mse') 57 | self.compiled = True 58 | 59 | def load_weights(self, filepath): 60 | self.model.load_weights(filepath) 61 | 62 | def save_weights(self, filepath, overwrite=False): 63 | self.model.save_weights(filepath, overwrite=overwrite) 64 | 65 | def get_weights_flat(self,weights): 66 | weights_flat = np.zeros(self.num_weights) 67 | 68 | pos = 0 69 | for i_layer, size in enumerate(self.sizes): 70 | weights_flat[pos:pos+size] = weights[i_layer].flatten() 71 | pos += size 72 | return weights_flat 73 | 74 | def get_weights_list(self,weights_flat): 75 | weights = [] 76 | pos = 0 77 | for i_layer, size in enumerate(self.sizes): 78 | arr = weights_flat[pos:pos+size].reshape(self.shapes[i_layer]) 79 | weights.append(arr) 80 | pos += size 81 | return weights 82 | 83 | def reset_states(self): 84 | self.recent_observation = None 85 | self.recent_action = None 86 | 87 | def select_action(self, state, stochastic=False): 88 | batch = np.array([state]) 89 | if self.processor is not None: 90 | batch = self.processor.process_state_batch(batch) 91 | 92 | action = self.model.predict_on_batch(batch).flatten() 93 | if stochastic or self.training: 94 | return np.random.choice(np.arange(self.nb_actions), p=np.exp(action) / np.sum(np.exp(action))) 95 | return np.argmax(action) 96 | 97 | def update_theta(self,theta): 98 | if (theta is not None): 99 | assert theta.shape == self.theta.shape, "Invalid theta, shape is {0} but should be {1}".format(theta.shape,self.theta.shape) 100 | assert (not np.isnan(theta).any()), "Invalid theta, NaN encountered" 101 | assert (theta[self.num_weights:] >= 0.).all(), "Invalid theta, standard deviations must be nonnegative" 102 | self.theta = theta 103 | else: 104 | means = np.ones(self.num_weights) * self.init_mean 105 | stdevs = np.ones(self.num_weights) * self.init_stdev 106 | self.theta = np.hstack((means,stdevs)) 107 | 108 | def choose_weights(self): 109 | mean = self.theta[:self.num_weights] 110 | std = self.theta[self.num_weights:] 111 | weights_flat = std * np.random.randn(self.num_weights) + mean 112 | 113 | sampled_weights = self.get_weights_list(weights_flat) 114 | self.model.set_weights(sampled_weights) 115 | 116 | def forward(self, observation): 117 | # Select an action. 118 | state = self.memory.get_recent_state(observation) 119 | action = self.select_action(state) 120 | if self.processor is not None: 121 | action = self.processor.process_action(action) 122 | 123 | # Book-keeping. 124 | self.recent_observation = observation 125 | self.recent_action = action 126 | 127 | return action 128 | 129 | @property 130 | def layers(self): 131 | return self.model.layers[:] 132 | 133 | def backward(self, reward, terminal): 134 | # Store most recent experience in memory. 135 | if self.step % self.memory_interval == 0: 136 | self.memory.append(self.recent_observation, self.recent_action, reward, terminal, 137 | training=self.training) 138 | 139 | metrics = [np.nan for _ in self.metrics_names] 140 | if not self.training: 141 | # We're done here. No need to update the experience memory since we only use the working 142 | # memory to obtain the state over the most recent observations. 
143 | return metrics 144 | 145 | if terminal: 146 | params = self.get_weights_flat(self.model.get_weights()) 147 | self.memory.finalize_episode(params) 148 | 149 | if self.step > self.nb_steps_warmup and self.episode % self.train_interval == 0: 150 | params, reward_totals = self.memory.sample(self.batch_size) 151 | best_idx = np.argsort(np.array(reward_totals))[-self.num_best:] 152 | best = np.vstack([params[i] for i in best_idx]) 153 | 154 | if reward_totals[best_idx[-1]] > self.best_seen[0]: 155 | self.best_seen = (reward_totals[best_idx[-1]], params[best_idx[-1]]) 156 | 157 | metrics = [np.mean(np.array(reward_totals)[best_idx])] 158 | if self.processor is not None: 159 | metrics += self.processor.metrics 160 | min_std = self.noise_ampl * np.exp(-self.step * self.noise_decay_const) 161 | 162 | mean = np.mean(best, axis=0) 163 | std = np.std(best, axis=0) + min_std 164 | new_theta = np.hstack((mean, std)) 165 | self.update_theta(new_theta) 166 | self.choose_weights() 167 | self.episode += 1 168 | return metrics 169 | 170 | def _on_train_end(self): 171 | self.model.set_weights(self.get_weights_list(self.best_seen[1])) 172 | 173 | @property 174 | def metrics_names(self): 175 | names = ['mean_best_reward'] 176 | if self.processor is not None: 177 | names += self.processor.metrics_names[:] 178 | return names 179 | -------------------------------------------------------------------------------- /tests/rl/agents/test_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import absolute_import 3 | 4 | import pytest 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | 8 | from keras.models import Sequential 9 | from keras.layers import Input, merge, Dense, Flatten 10 | 11 | from rl.agents.dqn import NAFLayer, DQNAgent, NAFAgent 12 | from rl.memory import SequentialMemory 13 | from rl.processors import MultiInputProcessor 14 | from rl.keras_future import concatenate, Model 15 | 16 | from ..util import MultiInputTestEnv 17 | 18 | 19 | def test_single_dqn_input(): 20 | model = Sequential() 21 | model.add(Flatten(input_shape=(2, 3))) 22 | model.add(Dense(2)) 23 | 24 | memory = SequentialMemory(limit=10, window_length=2) 25 | for double_dqn in (True, False): 26 | agent = DQNAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, 27 | enable_double_dqn=double_dqn) 28 | agent.compile('sgd') 29 | agent.fit(MultiInputTestEnv((3,)), nb_steps=10) 30 | 31 | 32 | def test_multi_dqn_input(): 33 | input1 = Input(shape=(2, 3)) 34 | input2 = Input(shape=(2, 4)) 35 | x = merge([input1, input2], mode='concat') 36 | x = Flatten()(x) 37 | x = Dense(2)(x) 38 | model = Model(input=[input1, input2], output=x) 39 | 40 | memory = SequentialMemory(limit=10, window_length=2) 41 | processor = MultiInputProcessor(nb_inputs=2) 42 | for double_dqn in (True, False): 43 | agent = DQNAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, 44 | processor=processor, enable_double_dqn=double_dqn) 45 | agent.compile('sgd') 46 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10) 47 | 48 | 49 | def test_single_continuous_dqn_input(): 50 | nb_actions = 2 51 | 52 | V_model = Sequential() 53 | V_model.add(Flatten(input_shape=(2, 3))) 54 | V_model.add(Dense(1)) 55 | 56 | mu_model = Sequential() 57 | mu_model.add(Flatten(input_shape=(2, 3))) 58 | mu_model.add(Dense(nb_actions)) 59 | 60 | L_input = Input(shape=(2, 3)) 61 | L_input_action = Input(shape=(nb_actions,)) 62 | x = 
concatenate([Flatten()(L_input), L_input_action]) 63 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 64 | L_model = Model(input=[L_input_action, L_input], output=x) 65 | 66 | memory = SequentialMemory(limit=10, window_length=2) 67 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 68 | memory=memory, nb_steps_warmup=5, batch_size=4) 69 | agent.compile('sgd') 70 | agent.fit(MultiInputTestEnv((3,)), nb_steps=10) 71 | 72 | 73 | def test_multi_continuous_dqn_input(): 74 | nb_actions = 2 75 | 76 | V_input1 = Input(shape=(2, 3)) 77 | V_input2 = Input(shape=(2, 4)) 78 | x = concatenate([V_input1, V_input2]) 79 | x = Flatten()(x) 80 | x = Dense(1)(x) 81 | V_model = Model(input=[V_input1, V_input2], output=x) 82 | 83 | mu_input1 = Input(shape=(2, 3)) 84 | mu_input2 = Input(shape=(2, 4)) 85 | x = concatenate([mu_input1, mu_input2]) 86 | x = Flatten()(x) 87 | x = Dense(nb_actions)(x) 88 | mu_model = Model(input=[mu_input1, mu_input2], output=x) 89 | 90 | L_input1 = Input(shape=(2, 3)) 91 | L_input2 = Input(shape=(2, 4)) 92 | L_input_action = Input(shape=(nb_actions,)) 93 | x = concatenate([L_input1, L_input2]) 94 | x = concatenate([Flatten()(x), L_input_action]) 95 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 96 | L_model = Model(input=[L_input_action, L_input1, L_input2], output=x) 97 | 98 | memory = SequentialMemory(limit=10, window_length=2) 99 | processor = MultiInputProcessor(nb_inputs=2) 100 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 101 | memory=memory, nb_steps_warmup=5, batch_size=4, processor=processor) 102 | agent.compile('sgd') 103 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10) 104 | 105 | 106 | def test_naf_layer_full(): 107 | batch_size = 2 108 | for nb_actions in (1, 3): 109 | # Construct single model with NAF as the only layer, hence it is fully deterministic 110 | # since no weights are used, which would be randomly initialized. 111 | L_flat_input = Input(shape=((nb_actions * nb_actions + nb_actions) // 2,)) 112 | mu_input = Input(shape=(nb_actions,)) 113 | action_input = Input(shape=(nb_actions,)) 114 | x = NAFLayer(nb_actions, mode='full')([L_flat_input, mu_input, action_input]) 115 | model = Model(input=[L_flat_input, mu_input, action_input], output=x) 116 | model.compile(loss='mse', optimizer='sgd') 117 | 118 | # Create random test data. 119 | L_flat = np.random.random((batch_size, (nb_actions * nb_actions + nb_actions) // 2)).astype('float32') 120 | mu = np.random.random((batch_size, nb_actions)).astype('float32') 121 | action = np.random.random((batch_size, nb_actions)).astype('float32') 122 | 123 | # Perform reference computations in numpy since these are much easier to verify. 124 | L = np.zeros((batch_size, nb_actions, nb_actions)).astype('float32') 125 | LT = np.copy(L) 126 | for l, l_T, l_flat in zip(L, LT, L_flat): 127 | l[np.tril_indices(nb_actions)] = l_flat 128 | l[np.diag_indices(nb_actions)] = np.exp(l[np.diag_indices(nb_actions)]) 129 | l_T[:, :] = l.T 130 | P = np.array([np.dot(l, l_T) for l, l_T in zip(L, LT)]).astype('float32') 131 | A_ref = np.array([np.dot(np.dot(a - m, p), a - m) for a, m, p in zip(action, mu, P)]).astype('float32') 132 | A_ref *= -.5 133 | 134 | # Finally, compute the output of the net, which should be identical to the previously 135 | # computed reference. 
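# For reference, the NAF advantage term being checked here is
# A(s, a) = -0.5 * (a - mu)^T P (a - mu) with P = L L^T, where L is lower triangular
# with exponentiated diagonal entries; this is exactly what the numpy reference above builds.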
136 | A_net = model.predict([L_flat, mu, action]).flatten() 137 | assert_allclose(A_net, A_ref, rtol=1e-5) 138 | 139 | 140 | def test_naf_layer_diag(): 141 | batch_size = 2 142 | for nb_actions in (1, 3): 143 | # Construct single model with NAF as the only layer, hence it is fully deterministic 144 | # since no weights are used, which would be randomly initialized. 145 | L_flat_input = Input(shape=(nb_actions,)) 146 | mu_input = Input(shape=(nb_actions,)) 147 | action_input = Input(shape=(nb_actions,)) 148 | x = NAFLayer(nb_actions, mode='diag')([L_flat_input, mu_input, action_input]) 149 | model = Model(input=[L_flat_input, mu_input, action_input], output=x) 150 | model.compile(loss='mse', optimizer='sgd') 151 | 152 | # Create random test data. 153 | L_flat = np.random.random((batch_size, nb_actions)).astype('float32') 154 | mu = np.random.random((batch_size, nb_actions)).astype('float32') 155 | action = np.random.random((batch_size, nb_actions)).astype('float32') 156 | 157 | # Perform reference computations in numpy since these are much easier to verify. 158 | P = np.zeros((batch_size, nb_actions, nb_actions)).astype('float32') 159 | for p, l_flat in zip(P, L_flat): 160 | p[np.diag_indices(nb_actions)] = l_flat 161 | print(P, L_flat) 162 | A_ref = np.array([np.dot(np.dot(a - m, p), a - m) for a, m, p in zip(action, mu, P)]).astype('float32') 163 | A_ref *= -.5 164 | 165 | # Finally, compute the output of the net, which should be identical to the previously 166 | # computed reference. 167 | A_net = model.predict([L_flat, mu, action]).flatten() 168 | assert_allclose(A_net, A_ref, rtol=1e-5) 169 | 170 | 171 | if __name__ == '__main__': 172 | pytest.main([__file__]) 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning for Keras 2 | [![Build Status](https://api.travis-ci.org/matthiasplappert/keras-rl.svg?branch=master)](https://travis-ci.org/matthiasplappert/keras-rl) 3 | [![Documentation](https://readthedocs.org/projects/keras-rl/badge/)](http://keras-rl.readthedocs.io/) 4 | [![License](https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000)](https://github.com/matthiasplappert/keras-rl/blob/master/LICENSE) 5 | [![Join the chat at https://gitter.im/keras-rl/Lobby](https://badges.gitter.im/keras-rl/Lobby.svg)](https://gitter.im/keras-rl/Lobby) 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 16 | ## What is it? 17 | `keras-rl` implements some state-of-the art deep reinforcement learning algorithms in Python and seamlessly integrates with the deep learning library [Keras](http://keras.io). Just like Keras, it works with either [Theano](http://deeplearning.net/software/theano/) or [TensorFlow](https://www.tensorflow.org/), which means that you can train your algorithm efficiently either on CPU or GPU. 18 | Furthermore, `keras-rl` works with [OpenAI Gym](https://gym.openai.com/) out of the box. This means that evaluating and playing around with different algorithms is easy. 19 | Of course you can extend `keras-rl` according to your own needs. You can use built-in Keras callbacks and metrics or define your own. 20 | Even more so, it is easy to implement your own environments and even algorithms by simply extending some simple abstract classes. 21 | 22 | In a nutshell: `keras-rl` makes it really easy to run state-of-the-art deep reinforcement learning algorithms, uses Keras and thus Theano or TensorFlow and was built with OpenAI Gym in mind. 23 | 24 | ## What is included? 25 | As of today, the following algorithms have been implemented: 26 | 27 | - Deep Q Learning (DQN) [[1]](http://arxiv.org/abs/1312.5602), [[2]](http://home.uchicago.edu/~arij/journalclub/papers/2015_Mnih_et_al.pdf) 28 | - Double DQN [[3]](http://arxiv.org/abs/1509.06461) 29 | - Deep Deterministic Policy Gradient (DDPG) [[4]](http://arxiv.org/abs/1509.02971) 30 | - Continuous DQN (CDQN or NAF) [[6]](http://arxiv.org/abs/1603.00748) 31 | - Cross-Entropy Method (CEM) [[7]](http://learning.mpi-sws.org/mlss2016/slides/2016-MLSS-RL.pdf), [[8]](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.81.6579&rep=rep1&type=pdf) 32 | - Dueling network DQN (Dueling DQN) [[9]](https://arxiv.org/abs/1511.06581) 33 | - Deep SARSA [[10]](http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf) 34 | 35 | You can find more information on each agent in the [wiki](https://github.com/matthiasplappert/keras-rl/wiki/Agent-Overview). 36 | 37 | I'm currently working on the following algorithms, which can be found on the `experimental` branch: 38 | 39 | - Asynchronous Advantage Actor-Critic (A3C) [[5]](http://arxiv.org/abs/1602.01783) 40 | 41 | Notice that these are **only experimental** and might currently not even run. 42 | 43 | ## How do I install it and how do I get started? 44 | Installing `keras-rl` is easy. Just run the following commands and you should be good to go: 45 | ```bash 46 | pip install keras-rl 47 | ``` 48 | This will install `keras-rl` and all necessary dependencies. 49 | 50 | If you want to run the examples, you'll also have to install `gym` by OpenAI. 51 | Please refer to [their installation instructions](https://github.com/openai/gym#installation). 52 | It's quite easy and works nicely on Ubuntu and Mac OS X. 53 | You'll also need the `h5py` package to load and save model weights, which can be installed using 54 | the following command: 55 | ```bash 56 | pip install h5py 57 | ``` 58 | 59 | Once you have installed everything, you can try out a simple example: 60 | ```bash 61 | python examples/dqn_cartpole.py 62 | ``` 63 | This is a very simple example and it should converge relatively quickly, so it's a great way to get started! 64 | It also visualizes the game during training, so you can watch it learn. How cool is that? 65 | 66 | Unfortunately, the documentation of `keras-rl` is currently almost non-existent. 
67 | However, you can find a couple of more examples that illustrate the usage of both DQN (for tasks with discrete actions) as well as for DDPG (for tasks with continuous actions). 68 | While these examples are not replacement for a proper documentation, they should be enough to get started quickly and to see the magic of reinforcement learning yourself. 69 | I also encourage you to play around with other environments (OpenAI Gym has plenty) and maybe even try to find better hyperparameters for the existing ones. 70 | 71 | If you have questions or problems, please file an issue or, even better, fix the problem yourself and submit a pull request! 72 | 73 | ## Do I have to train the models myself? 74 | Training times can be very long depending on the complexity of the environment. 75 | [This repo](https://github.com/matthiasplappert/keras-rl-weights) provides some weights that were obtained by running (at least some) of the examples that are included in `keras-rl`. 76 | You can load the weights using the `load_weights` method on the respective agents. 77 | 78 | ## Requirements 79 | - Python 2.7 or Python 3.5 80 | - [Keras](http://keras.io) >= 1.0.7 81 | 82 | That's it. However, if you want to run the examples, you'll also need the following dependencies: 83 | - [OpenAI Gym](https://github.com/openai/gym) 84 | - [h5py](https://pypi.python.org/pypi/h5py) 85 | 86 | `keras-rl` also works with [TensorFlow](https://www.tensorflow.org/). To find out how to use TensorFlow instead of [Theano](http://deeplearning.net/software/theano/), please refer to the [Keras documentation](http://keras.io/#switching-from-theano-to-tensorflow). 87 | 88 | ## Documentation 89 | We are currently in the process of getting a proper documentation going. [The latest version of the 90 | documentation is available online](http://keras-rl.readthedocs.org). All contributions to the 91 | documentation are greatly appreciated! 92 | 93 | ## Support 94 | You can ask questions and join the development discussion: 95 | 96 | - On the [Keras-RL Google group](https://groups.google.com/forum/#!forum/keras-rl-users). 97 | - On the [Keras-RL Gitter channel](https://gitter.im/keras-rl/Lobby). 98 | 99 | You can also post **bug reports and feature requests** (only!) in [Github issues](https://github.com/matthiasplappert/keras-rl/issues). 100 | 101 | ## Running the Tests 102 | To run the tests locally, you'll first have to install the following dependencies: 103 | ```bash 104 | pip install pytest pytest-xdist pep8 pytest-pep8 pytest-cov python-coveralls 105 | ``` 106 | You can then run all tests using this command: 107 | ```bash 108 | py.test tests/. 109 | ``` 110 | If you want to check if the files conform to the PEP8 style guidelines, run the following command: 111 | ```bash 112 | py.test --pep8 113 | ``` 114 | 115 | ## Citing 116 | If you use `keras-rl` in your research, you can cite it as follows: 117 | ```bibtex 118 | @misc{plappert2016kerasrl, 119 | author = {Matthias Plappert}, 120 | title = {keras-rl}, 121 | year = {2016}, 122 | publisher = {GitHub}, 123 | journal = {GitHub repository}, 124 | howpublished = {\url{https://github.com/matthiasplappert/keras-rl}}, 125 | } 126 | ``` 127 | 128 | 129 | ## Acknowledgments 130 | The foundation for this library was developed during my work at the [High Performance Humanoid Technologies (H²T)](https://h2t.anthropomatik.kit.edu/) lab at the [Karlsruhe Institute of Technology (KIT)](https://kit.edu). 131 | It has since been adapted to become a general-purpose library. 
132 | 
133 | ## References
134 | 1. *Playing Atari with Deep Reinforcement Learning*, Mnih et al., 2013
135 | 2. *Human-level control through deep reinforcement learning*, Mnih et al., 2015
136 | 3. *Deep Reinforcement Learning with Double Q-learning*, van Hasselt et al., 2015
137 | 4. *Continuous control with deep reinforcement learning*, Lillicrap et al., 2015
138 | 5. *Asynchronous Methods for Deep Reinforcement Learning*, Mnih et al., 2016
139 | 6. *Continuous Deep Q-Learning with Model-based Acceleration*, Gu et al., 2016
140 | 7. *Learning Tetris Using the Noisy Cross-Entropy Method*, Szita et al., 2006
141 | 8. *Deep Reinforcement Learning (MLSS lecture notes)*, Schulman, 2016
142 | 9. *Dueling Network Architectures for Deep Reinforcement Learning*, Wang et al., 2016
143 | 10. *Reinforcement learning: An introduction*, Sutton and Barto, 2011
144 | 
145 | ## Todos
146 | - Documentation: Work on the documentation has begun but not everything is documented in code yet. Additionally, it would be super nice to have guides for each agent that describe the basic ideas behind it.
147 | - TRPO, priority-based memory, A3C, async DQN, ...
148 | 
--------------------------------------------------------------------------------
/docs/autogen.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """ This code and the entire documentation setup was adopted from the Keras repository:
3 | https://github.com/fchollet/keras/blob/master/docs/autogen.py
4 | """
5 | from __future__ import print_function
6 | from __future__ import unicode_literals
7 | 
8 | import re
9 | import inspect
10 | import os
11 | import shutil
12 | import sys
13 | if sys.version[0] == '2':
14 |     reload(sys)
15 |     sys.setdefaultencoding('utf8')
16 | 
17 | import rl
18 | import rl.core
19 | import rl.processors
20 | import rl.agents
21 | 
22 | 
23 | EXCLUDE = {
24 | 
25 | }
26 | 
27 | PAGES = [
28 |     {
29 |         'page': 'core.md',
30 |         'all_module_classes': [rl.core],
31 |     },
32 |     {
33 |         'page': 'processors.md',
34 |         'all_module_classes': [rl.processors],
35 |     },
36 |     {
37 |         'page': 'agents/overview.md',
38 |         'functions': [
39 |             rl.core.Agent.fit,
40 |             rl.core.Agent.test,
41 |             rl.core.Agent.compile,
42 |             rl.core.Agent.get_config,
43 |             rl.core.Agent.reset_states,
44 |             rl.core.Agent.load_weights,
45 |             rl.core.Agent.save_weights,
46 |         ],
47 |     },
48 |     {
49 |         'page': 'agents/dqn.md',
50 |         'classes': [rl.agents.DQNAgent],
51 |     },
52 |     {
53 |         'page': 'agents/naf.md',
54 |         'classes': [rl.agents.NAFAgent],
55 |     },
56 |     {
57 |         'page': 'agents/ddpg.md',
58 |         'classes': [rl.agents.DDPGAgent],
59 |     },
60 |     {
61 |         'page': 'agents/sarsa.md',
62 |         'classes': [rl.agents.SARSAAgent],
63 |     },
64 |     {
65 |         'page': 'agents/cem.md',
66 |         'classes': [rl.agents.CEMAgent],
67 |     },
68 | ]
69 | 
70 | 
71 | ROOT_MODULE_NAME = 'rl.'
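# Illustrative example (hypothetical, not part of the original script): the `PAGES` list above
# drives the generator -- each entry maps one documentation page to the classes or functions
# whose docstrings should be rendered into it. An additional entry for the memory module could
# look like this (and would also require an `import rl.memory` at the top of this file):
#
#     {
#         'page': 'memory.md',
#         'all_module_classes': [rl.memory],
#     },
#
# The loop further below then fills `sources/memory.md` from a `templates/memory.md` containing
# an `{{autogenerated}}` tag if such a template exists, or creates the page from scratch.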
72 | 73 | 74 | def get_earliest_class_that_defined_member(member, cls): 75 | ancestors = get_classes_ancestors([cls]) 76 | result = None 77 | for ancestor in ancestors: 78 | if member in dir(ancestor): 79 | result = ancestor 80 | if not result: 81 | return cls 82 | return result 83 | 84 | 85 | def get_classes_ancestors(classes): 86 | ancestors = [] 87 | for cls in classes: 88 | ancestors += cls.__bases__ 89 | filtered_ancestors = [] 90 | for ancestor in ancestors: 91 | if ancestor.__name__ in ['object']: 92 | continue 93 | filtered_ancestors.append(ancestor) 94 | if filtered_ancestors: 95 | return filtered_ancestors + get_classes_ancestors(filtered_ancestors) 96 | else: 97 | return filtered_ancestors 98 | 99 | 100 | def get_function_signature(function, method=True): 101 | signature = getattr(function, '_legacy_support_signature', None) 102 | if signature is None: 103 | signature = inspect.getargspec(function) 104 | defaults = signature.defaults 105 | if method: 106 | args = signature.args[1:] 107 | else: 108 | args = signature.args 109 | if defaults: 110 | kwargs = zip(args[-len(defaults):], defaults) 111 | args = args[:-len(defaults)] 112 | else: 113 | kwargs = [] 114 | st = '%s.%s(' % (function.__module__, function.__name__) 115 | for a in args: 116 | st += str(a) + ', ' 117 | for a, v in kwargs: 118 | if isinstance(v, str): 119 | v = '\'' + v + '\'' 120 | st += str(a) + '=' + str(v) + ', ' 121 | if kwargs or args: 122 | return st[:-2] + ')' 123 | else: 124 | return st + ')' 125 | 126 | 127 | def get_class_signature(cls): 128 | try: 129 | class_signature = get_function_signature(cls.__init__) 130 | class_signature = class_signature.replace('__init__', cls.__name__) 131 | except: 132 | # in case the class inherits from object and does not 133 | # define __init__ 134 | class_signature = cls.__module__ + '.' 
+ cls.__name__ + '()' 135 | return class_signature 136 | 137 | 138 | def class_to_source_link(cls): 139 | module_name = cls.__module__ 140 | assert module_name.startswith(ROOT_MODULE_NAME) 141 | path = module_name.replace('.', '/') 142 | path += '.py' 143 | line = inspect.getsourcelines(cls)[-1] 144 | link = 'https://github.com/matthiasplappert/keras-rl/blob/master/' + path + '#L' + str(line) 145 | return '[[source]](' + link + ')' 146 | 147 | 148 | def function_to_source_link(fn): 149 | module_name = fn.__module__ 150 | assert module_name.startswith(ROOT_MODULE_NAME) 151 | path = module_name.replace('.', '/') 152 | path += '.py' 153 | line = inspect.getsourcelines(fn)[-1] 154 | link = 'https://github.com/matthiasplappert/keras-rl/blob/master/' + path + '#L' + str(line) 155 | return '[[source]](' + link + ')' 156 | 157 | 158 | def code_snippet(snippet): 159 | result = '```python\n' 160 | result += snippet + '\n' 161 | result += '```\n' 162 | return result 163 | 164 | 165 | def process_class_docstring(docstring): 166 | docstring = re.sub(r'\n # (.*)\n', 167 | r'\n __\1__\n\n', 168 | docstring) 169 | 170 | docstring = re.sub(r' ([^\s\\]+) \((.*)\n', 171 | r' - __\1__ (\2\n', 172 | docstring) 173 | 174 | docstring = docstring.replace(' ' * 5, '\t\t') 175 | docstring = docstring.replace(' ' * 3, '\t') 176 | docstring = docstring.replace(' ', '') 177 | return docstring 178 | 179 | 180 | def process_function_docstring(docstring): 181 | docstring = re.sub(r'\n # (.*)\n', 182 | r'\n __\1__\n\n', 183 | docstring) 184 | docstring = re.sub(r'\n # (.*)\n', 185 | r'\n __\1__\n\n', 186 | docstring) 187 | 188 | docstring = re.sub(r' ([^\s\\]+) \((.*)\n', 189 | r' - __\1__ (\2\n', 190 | docstring) 191 | 192 | docstring = docstring.replace(' ' * 6, '\t\t') 193 | docstring = docstring.replace(' ' * 4, '\t') 194 | docstring = docstring.replace(' ', '') 195 | return docstring 196 | 197 | print('Cleaning up existing sources directory.') 198 | if os.path.exists('sources'): 199 | shutil.rmtree('sources') 200 | 201 | print('Populating sources directory with templates.') 202 | for subdir, dirs, fnames in os.walk('templates'): 203 | for fname in fnames: 204 | new_subdir = subdir.replace('templates', 'sources') 205 | if not os.path.exists(new_subdir): 206 | os.makedirs(new_subdir) 207 | if fname[-3:] == '.md': 208 | fpath = os.path.join(subdir, fname) 209 | new_fpath = fpath.replace('templates', 'sources') 210 | shutil.copy(fpath, new_fpath) 211 | 212 | # Take care of index page. 
213 | readme = open('../README.md').read() 214 | index = open('templates/index.md').read() 215 | index = index.replace('{{autogenerated}}', readme[readme.find('##'):]) 216 | f = open('sources/index.md', 'w') 217 | f.write(index) 218 | f.close() 219 | 220 | print('Starting autogeneration.') 221 | for page_data in PAGES: 222 | blocks = [] 223 | classes = page_data.get('classes', []) 224 | for module in page_data.get('all_module_classes', []): 225 | module_classes = [] 226 | for name in dir(module): 227 | if name[0] == '_' or name in EXCLUDE: 228 | continue 229 | module_member = getattr(module, name) 230 | if inspect.isclass(module_member): 231 | cls = module_member 232 | if cls.__module__ == module.__name__: 233 | if cls not in module_classes: 234 | module_classes.append(cls) 235 | module_classes.sort(key=lambda x: id(x)) 236 | classes += module_classes 237 | 238 | for cls in classes: 239 | subblocks = [] 240 | signature = get_class_signature(cls) 241 | subblocks.append('' + class_to_source_link(cls) + '') 242 | subblocks.append('### ' + cls.__name__ + '\n') 243 | subblocks.append(code_snippet(signature)) 244 | docstring = cls.__doc__ 245 | if docstring: 246 | subblocks.append(process_class_docstring(docstring)) 247 | blocks.append('\n'.join(subblocks)) 248 | 249 | functions = page_data.get('functions', []) 250 | for module in page_data.get('all_module_functions', []): 251 | module_functions = [] 252 | for name in dir(module): 253 | if name[0] == '_' or name in EXCLUDE: 254 | continue 255 | module_member = getattr(module, name) 256 | if inspect.isfunction(module_member): 257 | function = module_member 258 | if module.__name__ in function.__module__: 259 | if function not in module_functions: 260 | module_functions.append(function) 261 | module_functions.sort(key=lambda x: id(x)) 262 | functions += module_functions 263 | 264 | for function in functions: 265 | subblocks = [] 266 | signature = get_function_signature(function, method=False) 267 | signature = signature.replace(function.__module__ + '.', '') 268 | subblocks.append('' + function_to_source_link(function) + '') 269 | subblocks.append('### ' + function.__name__ + '\n') 270 | subblocks.append(code_snippet(signature)) 271 | docstring = function.__doc__ 272 | if docstring: 273 | subblocks.append(process_function_docstring(docstring)) 274 | blocks.append('\n\n'.join(subblocks)) 275 | 276 | if not blocks: 277 | raise RuntimeError('Found no content for page ' + 278 | page_data['page']) 279 | 280 | mkdown = '\n----\n\n'.join(blocks) 281 | # save module page. 
282 | # Either insert content into existing page, 283 | # or create page otherwise 284 | page_name = page_data['page'] 285 | path = os.path.join('sources', page_name) 286 | if os.path.exists(path): 287 | template = open(path).read() 288 | assert '{{autogenerated}}' in template, ('Template found for ' + path + 289 | ' but missing {{autogenerated}} tag.') 290 | mkdown = template.replace('{{autogenerated}}', mkdown) 291 | print('...inserting autogenerated content into template:', path) 292 | else: 293 | print('...creating new page with autogenerated content:', path) 294 | subdir = os.path.dirname(path) 295 | if not os.path.exists(subdir): 296 | os.makedirs(subdir) 297 | open(path, 'w').write(mkdown) 298 | -------------------------------------------------------------------------------- /rl/agents/sarsa.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | import numpy as np 4 | 5 | from keras.callbacks import History 6 | from keras.layers import Input, Lambda 7 | import keras.backend as K 8 | 9 | from rl.core import Agent 10 | from rl.agents.dqn import mean_q 11 | from rl.util import huber_loss 12 | from rl.policy import EpsGreedyQPolicy, GreedyQPolicy 13 | from rl.util import get_object_config 14 | from rl.keras_future import Model 15 | 16 | 17 | class SARSAAgent(Agent): 18 | """Write me 19 | """ 20 | def __init__(self, model, nb_actions, policy=None, test_policy=None, gamma=.99, nb_steps_warmup=10, 21 | train_interval=1, delta_clip=np.inf, *args, **kwargs): 22 | super(SarsaAgent, self).__init__(*args, **kwargs) 23 | 24 | # Do not use defaults in constructor because that would mean that each instance shares the same 25 | # policy. 26 | if policy is None: 27 | policy = EpsGreedyQPolicy() 28 | if test_policy is None: 29 | test_policy = GreedyQPolicy() 30 | 31 | self.model = model 32 | self.nb_actions = nb_actions 33 | self.policy = policy 34 | self.test_policy = test_policy 35 | self.gamma = gamma 36 | self.nb_steps_warmup = nb_steps_warmup 37 | self.train_interval = train_interval 38 | 39 | self.delta_clip = delta_clip 40 | self.compiled = False 41 | self.actions = None 42 | self.observations = None 43 | self.rewards = None 44 | 45 | def compute_batch_q_values(self, state_batch): 46 | batch = self.process_state_batch(state_batch) 47 | q_values = self.model.predict_on_batch(batch) 48 | assert q_values.shape == (len(state_batch), self.nb_actions) 49 | return q_values 50 | 51 | def compute_q_values(self, state): 52 | q_values = self.compute_batch_q_values([state]).flatten() 53 | assert q_values.shape == (self.nb_actions,) 54 | return q_values 55 | 56 | def process_state_batch(self, batch): 57 | batch = np.array(batch) 58 | if self.processor is None: 59 | return batch 60 | return self.processor.process_state_batch(batch) 61 | 62 | def get_config(self): 63 | config = super(SarsaAgent, self).get_config() 64 | config['nb_actions'] = self.nb_actions 65 | config['gamma'] = self.gamma 66 | config['nb_steps_warmup'] = self.nb_steps_warmup 67 | config['train_interval'] = self.train_interval 68 | config['delta_clip'] = self.delta_clip 69 | config['model'] = get_object_config(self.model) 70 | config['policy'] = get_object_config(self.policy) 71 | config['test_policy'] = get_object_config(self.test_policy) 72 | return config 73 | 74 | def compile(self, optimizer, metrics=[]): 75 | metrics += [mean_q] # register default metrics 76 | 77 | def clipped_masked_error(args): 78 | y_true, y_pred, mask = args 79 | loss = huber_loss(y_true, y_pred, 
self.delta_clip) 80 | loss *= mask # apply element-wise mask 81 | return K.sum(loss, axis=-1) 82 | 83 | # Create trainable model. The problem is that we need to mask the output since we only 84 | # ever want to update the Q values for a certain action. The way we achieve this is by 85 | # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility 86 | # to mask out certain parameters by passing in multiple inputs to the Lambda layer. 87 | y_pred = self.model.output 88 | y_true = Input(name='y_true', shape=(self.nb_actions,)) 89 | mask = Input(name='mask', shape=(self.nb_actions,)) 90 | loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_pred, y_true, mask]) 91 | ins = [self.model.input] if type(self.model.input) is not list else self.model.input 92 | trainable_model = Model(input=ins + [y_true, mask], output=[loss_out, y_pred]) 93 | assert len(trainable_model.output_names) == 2 94 | combined_metrics = {trainable_model.output_names[1]: metrics} 95 | losses = [ 96 | lambda y_true, y_pred: y_pred, # loss is computed in Lambda layer 97 | lambda y_true, y_pred: K.zeros_like(y_pred), # we only include this for the metrics 98 | ] 99 | trainable_model.compile(optimizer=optimizer, loss=losses, metrics=combined_metrics) 100 | self.trainable_model = trainable_model 101 | 102 | self.compiled = True 103 | 104 | def load_weights(self, filepath): 105 | self.model.load_weights(filepath) 106 | 107 | def save_weights(self, filepath, overwrite=False): 108 | self.model.save_weights(filepath, overwrite=overwrite) 109 | 110 | def reset_states(self): 111 | self.actions = collections.deque(maxlen=2) 112 | self.observations = collections.deque(maxlen=2) 113 | self.rewards = collections.deque(maxlen=2) 114 | if self.compiled: 115 | self.model.reset_states() 116 | 117 | def forward(self, observation): 118 | # Select an action. 119 | q_values = self.compute_q_values([observation]) 120 | if self.training: 121 | action = self.policy.select_action(q_values=q_values) 122 | else: 123 | action = self.test_policy.select_action(q_values=q_values) 124 | if self.processor is not None: 125 | action = self.processor.process_action(action) 126 | 127 | # Book-keeping. 128 | self.observations.append(observation) 129 | self.actions.append(action) 130 | 131 | return action 132 | 133 | def backward(self, reward, terminal): 134 | metrics = [np.nan for _ in self.metrics_names] 135 | if not self.training: 136 | # We're done here. No need to update the experience memory since we only use the working 137 | # memory to obtain the state over the most recent observations. 138 | return metrics 139 | 140 | # Train the network on a single stochastic batch. 141 | if self.step > self.nb_steps_warmup and self.step % self.train_interval == 0: 142 | # Start by extracting the necessary parameters (we use a vectorized implementation). 143 | self.rewards.append(reward) 144 | if len(self.observations) < 2: 145 | return metrics # not enough data yet 146 | 147 | state0_batch = [self.observations[0]] 148 | reward_batch = [self.rewards[0]] 149 | action_batch = [self.actions[0]] 150 | terminal1_batch = [0.] if terminal else [1.] 151 | state1_batch = [self.observations[1]] 152 | action1_batch = [self.actions[1]] 153 | 154 | # Prepare and validate parameters. 
155 | state0_batch = self.process_state_batch(state0_batch) 156 | state1_batch = self.process_state_batch(state1_batch) 157 | terminal1_batch = np.array(terminal1_batch) 158 | reward_batch = np.array(reward_batch) 159 | assert reward_batch.shape == (1,) 160 | assert terminal1_batch.shape == reward_batch.shape 161 | assert len(action_batch) == len(reward_batch) 162 | 163 | batch = self.process_state_batch(state1_batch) 164 | q_values = self.compute_q_values(batch) 165 | q_values = q_values.reshape((1, self.nb_actions)) 166 | 167 | q_batch = q_values[0, action1_batch] 168 | 169 | assert q_batch.shape == (1,) 170 | targets = np.zeros((1, self.nb_actions)) 171 | dummy_targets = np.zeros((1,)) 172 | masks = np.zeros((1, self.nb_actions)) 173 | 174 | # Compute r_t + gamma * Q(s_t+1, a_t+1) 175 | discounted_reward_batch = self.gamma * q_batch 176 | # Set discounted reward to zero for all states that were terminal. 177 | discounted_reward_batch *= terminal1_batch 178 | assert discounted_reward_batch.shape == reward_batch.shape 179 | Rs = reward_batch + discounted_reward_batch 180 | for idx, (target, mask, R, action) in enumerate(zip(targets, masks, Rs, action_batch)): 181 | target[action] = R # update action with estimated accumulated reward 182 | dummy_targets[idx] = R 183 | mask[action] = 1. # enable loss for this specific action 184 | targets = np.array(targets).astype('float32') 185 | masks = np.array(masks).astype('float32') 186 | 187 | # Finally, perform a single update on the entire batch. We use a dummy target since 188 | # the actual loss is computed in a Lambda layer that needs more complex input. However, 189 | # it is still useful to know the actual target to compute metrics properly. 190 | state0_batch = state0_batch.reshape((1,) + state0_batch.shape) 191 | ins = [state0_batch] if type(self.model.input) is not list else state0_batch 192 | metrics = self.trainable_model.train_on_batch(ins + [targets, masks], [dummy_targets, targets]) 193 | metrics = [metric for idx, metric in enumerate(metrics) if idx not in (1, 2)] # throw away individual losses 194 | metrics += self.policy.metrics 195 | if self.processor is not None: 196 | metrics += self.processor.metrics 197 | return metrics 198 | 199 | @property 200 | def layers(self): 201 | return self.model.layers[:] 202 | 203 | @property 204 | def metrics_names(self): 205 | # Throw away individual losses and replace output name since this is hidden from the user. 
206 | assert len(self.trainable_model.output_names) == 2 207 | dummy_output_name = self.trainable_model.output_names[1] 208 | model_metrics = [name for idx, name in enumerate(self.trainable_model.metrics_names) if idx not in (1, 2)] 209 | model_metrics = [name.replace(dummy_output_name + '_', '') for name in model_metrics] 210 | 211 | names = model_metrics + self.policy.metrics_names[:] 212 | if self.processor is not None: 213 | names += self.processor.metrics_names[:] 214 | return names 215 | 216 | @property 217 | def policy(self): 218 | return self.__policy 219 | 220 | @policy.setter 221 | def policy(self, policy): 222 | self.__policy = policy 223 | self.__policy._set_agent(self) 224 | 225 | @property 226 | def test_policy(self): 227 | return self.__test_policy 228 | 229 | @test_policy.setter 230 | def test_policy(self, policy): 231 | self.__test_policy = policy 232 | self.__test_policy._set_agent(self) 233 | 234 | # Aliases 235 | SarsaAgent = SARSAAgent 236 | -------------------------------------------------------------------------------- /rl/memory.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from collections import deque, namedtuple 3 | import warnings 4 | import random 5 | 6 | import numpy as np 7 | 8 | 9 | # This is to be understood as a transition: Given `state0`, performing `action` 10 | # yields `reward` and results in `state1`, which might be `terminal`. 11 | Experience = namedtuple('Experience', 'state0, action, reward, state1, terminal1') 12 | 13 | 14 | def sample_batch_indexes(low, high, size): 15 | if high - low >= size: 16 | # We have enough data. Draw without replacement, that is each index is unique in the 17 | # batch. We cannot use `np.random.choice` here because it is horribly inefficient as 18 | # the memory grows. See https://github.com/numpy/numpy/issues/2764 for a discussion. 19 | # `random.sample` does the same thing (drawing without replacement) and is way faster. 20 | try: 21 | r = xrange(low, high) 22 | except NameError: 23 | r = range(low, high) 24 | batch_idxs = random.sample(r, size) 25 | else: 26 | # Not enough data. Help ourselves with sampling from the range, but the same index 27 | # can occur multiple times. This is not good and should be avoided by picking a 28 | # large enough warm-up phase. 29 | warnings.warn('Not enough entries to sample without replacement. Consider increasing your warm-up phase to avoid oversampling!') 30 | batch_idxs = np.random.random_integers(low, high - 1, size=size) 31 | assert len(batch_idxs) == size 32 | return batch_idxs 33 | 34 | 35 | class RingBuffer(object): 36 | def __init__(self, maxlen): 37 | self.maxlen = maxlen 38 | self.start = 0 39 | self.length = 0 40 | self.data = [None for _ in range(maxlen)] 41 | 42 | def __len__(self): 43 | return self.length 44 | 45 | def __getitem__(self, idx): 46 | if idx < 0 or idx >= self.length: 47 | raise KeyError() 48 | return self.data[(self.start + idx) % self.maxlen] 49 | 50 | def append(self, v): 51 | if self.length < self.maxlen: 52 | # We have space, simply increase the length. 53 | self.length += 1 54 | elif self.length == self.maxlen: 55 | # No space, "remove" the first item. 56 | self.start = (self.start + 1) % self.maxlen 57 | else: 58 | # This should never happen. 
59 | raise RuntimeError() 60 | self.data[(self.start + self.length - 1) % self.maxlen] = v 61 | 62 | 63 | def zeroed_observation(observation): 64 | if hasattr(observation, 'shape'): 65 | return np.zeros(observation.shape) 66 | elif hasattr(observation, '__iter__'): 67 | out = [] 68 | for x in observation: 69 | out.append(zeroed_observation(x)) 70 | return out 71 | else: 72 | return 0. 73 | 74 | 75 | class Memory(object): 76 | def __init__(self, window_length, ignore_episode_boundaries=False): 77 | self.window_length = window_length 78 | self.ignore_episode_boundaries = ignore_episode_boundaries 79 | 80 | self.recent_observations = deque(maxlen=window_length) 81 | self.recent_terminals = deque(maxlen=window_length) 82 | 83 | def sample(self, batch_size, batch_idxs=None): 84 | raise NotImplementedError() 85 | 86 | def append(self, observation, action, reward, terminal, training=True): 87 | self.recent_observations.append(observation) 88 | self.recent_terminals.append(terminal) 89 | 90 | def get_recent_state(self, current_observation): 91 | # This code is slightly complicated by the fact that subsequent observations might be 92 | # from different episodes. We ensure that an experience never spans multiple episodes. 93 | # This is probably not that important in practice but it seems cleaner. 94 | state = [current_observation] 95 | idx = len(self.recent_observations) - 1 96 | for offset in range(0, self.window_length - 1): 97 | current_idx = idx - offset 98 | current_terminal = self.recent_terminals[current_idx - 1] if current_idx - 1 >= 0 else False 99 | if current_idx < 0 or (not self.ignore_episode_boundaries and current_terminal): 100 | # The previously handled observation was terminal, don't add the current one. 101 | # Otherwise we would leak into a different episode. 102 | break 103 | state.insert(0, self.recent_observations[current_idx]) 104 | while len(state) < self.window_length: 105 | state.insert(0, zeroed_observation(state[0])) 106 | return state 107 | 108 | def get_config(self): 109 | config = { 110 | 'window_length': self.window_length, 111 | 'ignore_episode_boundaries': self.ignore_episode_boundaries, 112 | } 113 | return config 114 | 115 | class SequentialMemory(Memory): 116 | def __init__(self, limit, **kwargs): 117 | super(SequentialMemory, self).__init__(**kwargs) 118 | 119 | self.limit = limit 120 | 121 | # Do not use deque to implement the memory. This data structure may seem convenient but 122 | # it is way too slow on random access. Instead, we use our own ring buffer implementation. 123 | self.actions = RingBuffer(limit) 124 | self.rewards = RingBuffer(limit) 125 | self.terminals = RingBuffer(limit) 126 | self.observations = RingBuffer(limit) 127 | 128 | def sample(self, batch_size, batch_idxs=None): 129 | if batch_idxs is None: 130 | # Draw random indexes such that we have at least a single entry before each 131 | # index. 132 | batch_idxs = sample_batch_indexes(0, self.nb_entries - 1, size=batch_size) 133 | batch_idxs = np.array(batch_idxs) + 1 134 | assert np.min(batch_idxs) >= 1 135 | assert np.max(batch_idxs) < self.nb_entries 136 | assert len(batch_idxs) == batch_size 137 | 138 | # Create experiences 139 | experiences = [] 140 | for idx in batch_idxs: 141 | terminal0 = self.terminals[idx - 2] if idx >= 2 else False 142 | while terminal0: 143 | # Skip this transition because the environment was reset here. Select a new, random 144 | # transition and use this instead. This may cause the batch to contain the same 145 | # transition twice. 
146 |                 idx = sample_batch_indexes(1, self.nb_entries, size=1)[0]
147 |                 terminal0 = self.terminals[idx - 2] if idx >= 2 else False
148 |             assert 1 <= idx < self.nb_entries
149 | 
150 |             # This code is slightly complicated by the fact that subsequent observations might be
151 |             # from different episodes. We ensure that an experience never spans multiple episodes.
152 |             # This is probably not that important in practice but it seems cleaner.
153 |             state0 = [self.observations[idx - 1]]
154 |             for offset in range(0, self.window_length - 1):
155 |                 current_idx = idx - 2 - offset
156 |                 current_terminal = self.terminals[current_idx - 1] if current_idx - 1 > 0 else False
157 |                 if current_idx < 0 or (not self.ignore_episode_boundaries and current_terminal):
158 |                     # The previously handled observation was terminal, don't add the current one.
159 |                     # Otherwise we would leak into a different episode.
160 |                     break
161 |                 state0.insert(0, self.observations[current_idx])
162 |             while len(state0) < self.window_length:
163 |                 state0.insert(0, zeroed_observation(state0[0]))
164 |             action = self.actions[idx - 1]
165 |             reward = self.rewards[idx - 1]
166 |             terminal1 = self.terminals[idx - 1]
167 | 
168 |             # Okay, now we need to create the follow-up state. This is state0 shifted one timestep
169 |             # to the right. Again, we need to be careful to not include an observation from the next
170 |             # episode if the last state is terminal.
171 |             state1 = [np.copy(x) for x in state0[1:]]
172 |             state1.append(self.observations[idx])
173 | 
174 |             assert len(state0) == self.window_length
175 |             assert len(state1) == len(state0)
176 |             experiences.append(Experience(state0=state0, action=action, reward=reward,
177 |                                           state1=state1, terminal1=terminal1))
178 |         assert len(experiences) == batch_size
179 |         return experiences
180 | 
181 |     def append(self, observation, action, reward, terminal, training=True):
182 |         super(SequentialMemory, self).append(observation, action, reward, terminal, training=training)
183 | 
184 |         # This needs to be understood as follows: in `observation`, take `action`, obtain `reward`
185 |         # and whether the next state is `terminal` or not.
186 |         if training:
187 |             self.observations.append(observation)
188 |             self.actions.append(action)
189 |             self.rewards.append(reward)
190 |             self.terminals.append(terminal)
191 | 
192 |     @property
193 |     def nb_entries(self):
194 |         return len(self.observations)
195 | 
196 |     def get_config(self):
197 |         config = super(SequentialMemory, self).get_config()
198 |         config['limit'] = self.limit
199 |         return config
200 | 
201 | 
202 | class EpisodeParameterMemory(Memory):
203 |     def __init__(self, limit, **kwargs):
204 |         super(EpisodeParameterMemory, self).__init__(**kwargs)
205 |         self.limit = limit
206 | 
207 |         self.params = RingBuffer(limit)
208 |         self.intermediate_rewards = []
209 |         self.total_rewards = RingBuffer(limit)
210 | 
211 |     def sample(self, batch_size, batch_idxs=None):
212 |         if batch_idxs is None:
213 |             batch_idxs = sample_batch_indexes(0, self.nb_entries, size=batch_size)
214 |         assert len(batch_idxs) == batch_size
215 | 
216 |         batch_params = []
217 |         batch_total_rewards = []
218 |         for idx in batch_idxs:
219 |             batch_params.append(self.params[idx])
220 |             batch_total_rewards.append(self.total_rewards[idx])
221 |         return batch_params, batch_total_rewards
222 | 
223 |     def append(self, observation, action, reward, terminal, training=True):
224 |         super(EpisodeParameterMemory, self).append(observation, action, reward, terminal, training=training)
225 |         if training:
226 |             self.intermediate_rewards.append(reward)
227 | 
228 |     def finalize_episode(self, params):
229 |         total_reward = sum(self.intermediate_rewards)
230 |         self.total_rewards.append(total_reward)
231 |         self.params.append(params)
232 |         self.intermediate_rewards = []
233 | 
234 |     @property
235 |     def nb_entries(self):
236 |         return len(self.total_rewards)
237 | 
238 |     def get_config(self):
239 |         config = super(EpisodeParameterMemory, self).get_config()
240 |         config['limit'] = self.limit
241 |         return config
242 | 
--------------------------------------------------------------------------------
/tests/rl/test_memory.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import pytest
3 | import numpy as np
4 | from numpy.testing import assert_allclose
5 | 
6 | from rl.memory import SequentialMemory, RingBuffer
7 | 
8 | 
9 | def test_ring_buffer():
10 |     def assert_elements(b, ref):
11 |         assert len(b) == len(ref)
12 |         for idx in range(b.maxlen):
13 |             if idx >= len(ref):
14 |                 with pytest.raises(KeyError):
15 |                     b[idx]
16 |             else:
17 |                 assert b[idx] == ref[idx]
18 | 
19 |     b = RingBuffer(5)
20 | 
21 |     # Fill buffer.
22 |     assert_elements(b, [])
23 |     b.append(1)
24 |     assert_elements(b, [1])
25 |     b.append(2)
26 |     assert_elements(b, [1, 2])
27 |     b.append(3)
28 |     assert_elements(b, [1, 2, 3])
29 |     b.append(4)
30 |     assert_elements(b, [1, 2, 3, 4])
31 |     b.append(5)
32 |     assert_elements(b, [1, 2, 3, 4, 5])
33 | 
34 |     # Add a couple more items with the buffer at its limit.
35 | b.append(6) 36 | assert_elements(b, [2, 3, 4, 5, 6]) 37 | b.append(7) 38 | assert_elements(b, [3, 4, 5, 6, 7]) 39 | b.append(8) 40 | assert_elements(b, [4, 5, 6, 7, 8]) 41 | 42 | 43 | def test_get_recent_state_with_episode_boundaries(): 44 | memory = SequentialMemory(3, window_length=2, ignore_episode_boundaries=False) 45 | obs_size = (3, 4) 46 | 47 | obs0 = np.random.random(obs_size) 48 | terminal0 = False 49 | 50 | obs1 = np.random.random(obs_size) 51 | terminal1 = False 52 | 53 | obs2 = np.random.random(obs_size) 54 | terminal2 = False 55 | 56 | obs3 = np.random.random(obs_size) 57 | terminal3 = True 58 | 59 | obs4 = np.random.random(obs_size) 60 | terminal4 = False 61 | 62 | obs5 = np.random.random(obs_size) 63 | terminal5 = True 64 | 65 | obs6 = np.random.random(obs_size) 66 | terminal6 = False 67 | 68 | state = np.array(memory.get_recent_state(obs0)) 69 | assert state.shape == (2,) + obs_size 70 | assert np.allclose(state[0], 0.) 71 | assert np.all(state[1] == obs0) 72 | 73 | # memory.append takes the current observation, the reward after taking an action and if 74 | # the *new* observation is terminal, thus `obs0` and `terminal1` is correct. 75 | memory.append(obs0, 0, 0., terminal1) 76 | state = np.array(memory.get_recent_state(obs1)) 77 | assert state.shape == (2,) + obs_size 78 | assert np.all(state[0] == obs0) 79 | assert np.all(state[1] == obs1) 80 | 81 | memory.append(obs1, 0, 0., terminal2) 82 | state = np.array(memory.get_recent_state(obs2)) 83 | assert state.shape == (2,) + obs_size 84 | assert np.all(state[0] == obs1) 85 | assert np.all(state[1] == obs2) 86 | 87 | memory.append(obs2, 0, 0., terminal3) 88 | state = np.array(memory.get_recent_state(obs3)) 89 | assert state.shape == (2,) + obs_size 90 | assert np.all(state[0] == obs2) 91 | assert np.all(state[1] == obs3) 92 | 93 | memory.append(obs3, 0, 0., terminal4) 94 | state = np.array(memory.get_recent_state(obs4)) 95 | assert state.shape == (2,) + obs_size 96 | assert np.all(state[0] == np.zeros(obs_size)) 97 | assert np.all(state[1] == obs4) 98 | 99 | memory.append(obs4, 0, 0., terminal5) 100 | state = np.array(memory.get_recent_state(obs5)) 101 | assert state.shape == (2,) + obs_size 102 | assert np.all(state[0] == obs4) 103 | assert np.all(state[1] == obs5) 104 | 105 | memory.append(obs5, 0, 0., terminal6) 106 | state = np.array(memory.get_recent_state(obs6)) 107 | assert state.shape == (2,) + obs_size 108 | assert np.all(state[0] == np.zeros(obs_size)) 109 | assert np.all(state[1] == obs6) 110 | 111 | 112 | def test_training_flag(): 113 | obs_size = (3, 4) 114 | 115 | obs0 = np.random.random(obs_size) 116 | terminal0 = False 117 | 118 | obs1 = np.random.random(obs_size) 119 | terminal1 = True 120 | 121 | obs2 = np.random.random(obs_size) 122 | terminal2 = False 123 | 124 | for training in (True, False): 125 | memory = SequentialMemory(3, window_length=2) 126 | 127 | state = np.array(memory.get_recent_state(obs0)) 128 | assert state.shape == (2,) + obs_size 129 | assert np.allclose(state[0], 0.) 
130 | assert np.all(state[1] == obs0) 131 | assert memory.nb_entries == 0 132 | 133 | memory.append(obs0, 0, 0., terminal1, training=training) 134 | state = np.array(memory.get_recent_state(obs1)) 135 | assert state.shape == (2,) + obs_size 136 | assert np.all(state[0] == obs0) 137 | assert np.all(state[1] == obs1) 138 | if training: 139 | assert memory.nb_entries == 1 140 | else: 141 | assert memory.nb_entries == 0 142 | 143 | memory.append(obs1, 0, 0., terminal2, training=training) 144 | state = np.array(memory.get_recent_state(obs2)) 145 | assert state.shape == (2,) + obs_size 146 | assert np.allclose(state[0], 0.) 147 | assert np.all(state[1] == obs2) 148 | if training: 149 | assert memory.nb_entries == 2 150 | else: 151 | assert memory.nb_entries == 0 152 | 153 | 154 | def test_get_recent_state_without_episode_boundaries(): 155 | memory = SequentialMemory(3, window_length=2, ignore_episode_boundaries=True) 156 | obs_size = (3, 4) 157 | 158 | obs0 = np.random.random(obs_size) 159 | terminal0 = False 160 | 161 | obs1 = np.random.random(obs_size) 162 | terminal1 = False 163 | 164 | obs2 = np.random.random(obs_size) 165 | terminal2 = False 166 | 167 | obs3 = np.random.random(obs_size) 168 | terminal3 = True 169 | 170 | obs4 = np.random.random(obs_size) 171 | terminal4 = False 172 | 173 | obs5 = np.random.random(obs_size) 174 | terminal5 = True 175 | 176 | obs6 = np.random.random(obs_size) 177 | terminal6 = False 178 | 179 | state = np.array(memory.get_recent_state(obs0)) 180 | assert state.shape == (2,) + obs_size 181 | assert np.allclose(state[0], 0.) 182 | assert np.all(state[1] == obs0) 183 | 184 | # memory.append takes the current observation, the reward after taking an action and if 185 | # the *new* observation is terminal, thus `obs0` and `terminal1` is correct. 
186 | memory.append(obs0, 0, 0., terminal1) 187 | state = np.array(memory.get_recent_state(obs1)) 188 | assert state.shape == (2,) + obs_size 189 | assert np.all(state[0] == obs0) 190 | assert np.all(state[1] == obs1) 191 | 192 | memory.append(obs1, 0, 0., terminal2) 193 | state = np.array(memory.get_recent_state(obs2)) 194 | assert state.shape == (2,) + obs_size 195 | assert np.all(state[0] == obs1) 196 | assert np.all(state[1] == obs2) 197 | 198 | memory.append(obs2, 0, 0., terminal3) 199 | state = np.array(memory.get_recent_state(obs3)) 200 | assert state.shape == (2,) + obs_size 201 | assert np.all(state[0] == obs2) 202 | assert np.all(state[1] == obs3) 203 | 204 | memory.append(obs3, 0, 0., terminal4) 205 | state = np.array(memory.get_recent_state(obs4)) 206 | assert state.shape == (2,) + obs_size 207 | assert np.all(state[0] == obs3) 208 | assert np.all(state[1] == obs4) 209 | 210 | memory.append(obs4, 0, 0., terminal5) 211 | state = np.array(memory.get_recent_state(obs5)) 212 | assert state.shape == (2,) + obs_size 213 | assert np.all(state[0] == obs4) 214 | assert np.all(state[1] == obs5) 215 | 216 | memory.append(obs5, 0, 0., terminal6) 217 | state = np.array(memory.get_recent_state(obs6)) 218 | assert state.shape == (2,) + obs_size 219 | assert np.all(state[0] == obs5) 220 | assert np.all(state[1] == obs6) 221 | 222 | 223 | def test_sampling(): 224 | memory = SequentialMemory(100, window_length=2, ignore_episode_boundaries=False) 225 | obs_size = (3, 4) 226 | actions = range(5) 227 | 228 | obs0 = np.random.random(obs_size) 229 | terminal0 = False 230 | action0 = np.random.choice(actions) 231 | reward0 = np.random.random() 232 | 233 | obs1 = np.random.random(obs_size) 234 | terminal1 = False 235 | action1 = np.random.choice(actions) 236 | reward1 = np.random.random() 237 | 238 | obs2 = np.random.random(obs_size) 239 | terminal2 = False 240 | action2 = np.random.choice(actions) 241 | reward2 = np.random.random() 242 | 243 | obs3 = np.random.random(obs_size) 244 | terminal3 = True 245 | action3 = np.random.choice(actions) 246 | reward3 = np.random.random() 247 | 248 | obs4 = np.random.random(obs_size) 249 | terminal4 = False 250 | action4 = np.random.choice(actions) 251 | reward4 = np.random.random() 252 | 253 | obs5 = np.random.random(obs_size) 254 | terminal5 = False 255 | action5 = np.random.choice(actions) 256 | reward5 = np.random.random() 257 | 258 | obs6 = np.random.random(obs_size) 259 | terminal6 = False 260 | action6 = np.random.choice(actions) 261 | reward6 = np.random.random() 262 | 263 | # memory.append takes the current observation, the reward after taking an action and if 264 | # the *new* observation is terminal, thus `obs0` and `terminal1` is correct. 
265 | memory.append(obs0, action0, reward0, terminal1) 266 | memory.append(obs1, action1, reward1, terminal2) 267 | memory.append(obs2, action2, reward2, terminal3) 268 | memory.append(obs3, action3, reward3, terminal4) 269 | memory.append(obs4, action4, reward4, terminal5) 270 | memory.append(obs5, action5, reward5, terminal6) 271 | assert memory.nb_entries == 6 272 | 273 | experiences = memory.sample(batch_size=5, batch_idxs=[0, 1, 2, 3, 4]) 274 | assert len(experiences) == 5 275 | 276 | assert_allclose(experiences[0].state0, np.array([np.zeros(obs_size), obs0])) 277 | assert_allclose(experiences[0].state1, np.array([obs0, obs1])) 278 | assert experiences[0].action == action0 279 | assert experiences[0].reward == reward0 280 | assert experiences[0].terminal1 is False 281 | 282 | assert_allclose(experiences[1].state0, np.array([obs0, obs1])) 283 | assert_allclose(experiences[1].state1, np.array([obs1, obs2])) 284 | assert experiences[1].action == action1 285 | assert experiences[1].reward == reward1 286 | assert experiences[1].terminal1 is False 287 | 288 | assert_allclose(experiences[2].state0, np.array([obs1, obs2])) 289 | assert_allclose(experiences[2].state1, np.array([obs2, obs3])) 290 | assert experiences[2].action == action2 291 | assert experiences[2].reward == reward2 292 | assert experiences[2].terminal1 is True 293 | 294 | # Next experience has been re-sampled since since state0 would be terminal in which case we 295 | # cannot really have a meaningful transition because the environment gets reset. We thus 296 | # just ensure that state0 is not terminal. 297 | assert not np.all(experiences[3].state0 == np.array([obs2, obs3])) 298 | 299 | assert_allclose(experiences[4].state0, np.array([np.zeros(obs_size), obs4])) 300 | assert_allclose(experiences[4].state1, np.array([obs4, obs5])) 301 | assert experiences[4].action == action4 302 | assert experiences[4].reward == reward4 303 | assert experiences[4].terminal1 is False 304 | 305 | 306 | if __name__ == '__main__': 307 | pytest.main([__file__]) 308 | -------------------------------------------------------------------------------- /rl/callbacks.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | import warnings 4 | import timeit 5 | import json 6 | from tempfile import mkdtemp 7 | 8 | import numpy as np 9 | 10 | from keras.callbacks import Callback as KerasCallback, CallbackList as KerasCallbackList 11 | from keras.utils.generic_utils import Progbar 12 | 13 | 14 | class Callback(KerasCallback): 15 | def _set_env(self, env): 16 | self.env = env 17 | 18 | def on_episode_begin(self, episode, logs={}): 19 | pass 20 | 21 | def on_episode_end(self, episode, logs={}): 22 | pass 23 | 24 | def on_step_begin(self, step, logs={}): 25 | pass 26 | 27 | def on_step_end(self, step, logs={}): 28 | pass 29 | 30 | def on_action_begin(self, action, logs={}): 31 | pass 32 | 33 | def on_action_end(self, action, logs={}): 34 | pass 35 | 36 | 37 | class CallbackList(KerasCallbackList): 38 | def _set_env(self, env): 39 | for callback in self.callbacks: 40 | if callable(getattr(callback, '_set_env', None)): 41 | callback._set_env(env) 42 | 43 | def on_episode_begin(self, episode, logs={}): 44 | for callback in self.callbacks: 45 | # Check if callback supports the more appropriate `on_episode_begin` callback. 46 | # If not, fall back to `on_epoch_begin` to be compatible with built-in Keras callbacks. 
47 | if callable(getattr(callback, 'on_episode_begin', None)): 48 | callback.on_episode_begin(episode, logs=logs) 49 | else: 50 | callback.on_epoch_begin(episode, logs=logs) 51 | 52 | def on_episode_end(self, episode, logs={}): 53 | for callback in self.callbacks: 54 | # Check if callback supports the more appropriate `on_episode_end` callback. 55 | # If not, fall back to `on_epoch_end` to be compatible with built-in Keras callbacks. 56 | if callable(getattr(callback, 'on_episode_end', None)): 57 | callback.on_episode_end(episode, logs=logs) 58 | else: 59 | callback.on_epoch_end(episode, logs=logs) 60 | 61 | def on_step_begin(self, step, logs={}): 62 | for callback in self.callbacks: 63 | # Check if callback supports the more appropriate `on_step_begin` callback. 64 | # If not, fall back to `on_batch_begin` to be compatible with built-in Keras callbacks. 65 | if callable(getattr(callback, 'on_step_begin', None)): 66 | callback.on_step_begin(step, logs=logs) 67 | else: 68 | callback.on_batch_begin(step, logs=logs) 69 | 70 | def on_step_end(self, step, logs={}): 71 | for callback in self.callbacks: 72 | # Check if callback supports the more appropriate `on_step_end` callback. 73 | # If not, fall back to `on_batch_end` to be compatible with built-in Keras callbacks. 74 | if callable(getattr(callback, 'on_step_end', None)): 75 | callback.on_step_end(step, logs=logs) 76 | else: 77 | callback.on_batch_end(step, logs=logs) 78 | 79 | def on_action_begin(self, action, logs={}): 80 | for callback in self.callbacks: 81 | if callable(getattr(callback, 'on_action_begin', None)): 82 | callback.on_action_begin(action, logs=logs) 83 | 84 | def on_action_end(self, action, logs={}): 85 | for callback in self.callbacks: 86 | if callable(getattr(callback, 'on_action_end', None)): 87 | callback.on_action_end(action, logs=logs) 88 | 89 | 90 | class TestLogger(Callback): 91 | def on_train_begin(self, logs): 92 | print('Testing for {} episodes ...'.format(self.params['nb_episodes'])) 93 | 94 | def on_episode_end(self, episode, logs): 95 | template = 'Episode {0}: reward: {1:.3f}, steps: {2}' 96 | variables = [ 97 | episode + 1, 98 | logs['episode_reward'], 99 | logs['nb_steps'], 100 | ] 101 | print(template.format(*variables)) 102 | 103 | 104 | class TrainEpisodeLogger(Callback): 105 | def __init__(self): 106 | # Some algorithms compute multiple episodes at once since they are multi-threaded. 107 | # We therefore use a dictionary that is indexed by the episode to separate episodes 108 | # from each other. 109 | self.episode_start = {} 110 | self.observations = {} 111 | self.rewards = {} 112 | self.actions = {} 113 | self.metrics = {} 114 | self.step = 0 115 | 116 | def on_train_begin(self, logs): 117 | self.train_start = timeit.default_timer() 118 | self.metrics_names = self.model.metrics_names 119 | print('Training for {} steps ...'.format(self.params['nb_steps'])) 120 | 121 | def on_train_end(self, logs): 122 | duration = timeit.default_timer() - self.train_start 123 | print('done, took {:.3f} seconds'.format(duration)) 124 | 125 | def on_episode_begin(self, episode, logs): 126 | self.episode_start[episode] = timeit.default_timer() 127 | self.observations[episode] = [] 128 | self.rewards[episode] = [] 129 | self.actions[episode] = [] 130 | self.metrics[episode] = [] 131 | 132 | def on_episode_end(self, episode, logs): 133 | duration = timeit.default_timer() - self.episode_start[episode] 134 | episode_steps = len(self.observations[episode]) 135 | 136 | # Format all metrics. 
137 | metrics = np.array(self.metrics[episode]) 138 | metrics_template = '' 139 | metrics_variables = [] 140 | with warnings.catch_warnings(): 141 | warnings.filterwarnings('error') 142 | for idx, name in enumerate(self.metrics_names): 143 | if idx > 0: 144 | metrics_template += ', ' 145 | try: 146 | value = np.nanmean(metrics[:, idx]) 147 | metrics_template += '{}: {:f}' 148 | except Warning: 149 | value = '--' 150 | metrics_template += '{}: {}' 151 | metrics_variables += [name, value] 152 | metrics_text = metrics_template.format(*metrics_variables) 153 | 154 | nb_step_digits = str(int(np.ceil(np.log10(self.params['nb_steps']))) + 1) 155 | template = '{step: ' + nb_step_digits + 'd}/{nb_steps}: episode: {episode}, duration: {duration:.3f}s, episode steps: {episode_steps}, steps per second: {sps:.0f}, episode reward: {episode_reward:.3f}, mean reward: {reward_mean:.3f} [{reward_min:.3f}, {reward_max:.3f}], mean action: {action_mean:.3f} [{action_min:.3f}, {action_max:.3f}], mean observation: {obs_mean:.3f} [{obs_min:.3f}, {obs_max:.3f}], {metrics}' 156 | variables = { 157 | 'step': self.step, 158 | 'nb_steps': self.params['nb_steps'], 159 | 'episode': episode + 1, 160 | 'duration': duration, 161 | 'episode_steps': episode_steps, 162 | 'sps': float(episode_steps) / duration, 163 | 'episode_reward': np.sum(self.rewards[episode]), 164 | 'reward_mean': np.mean(self.rewards[episode]), 165 | 'reward_min': np.min(self.rewards[episode]), 166 | 'reward_max': np.max(self.rewards[episode]), 167 | 'action_mean': np.mean(self.actions[episode]), 168 | 'action_min': np.min(self.actions[episode]), 169 | 'action_max': np.max(self.actions[episode]), 170 | 'obs_mean': np.mean(self.observations[episode]), 171 | 'obs_min': np.min(self.observations[episode]), 172 | 'obs_max': np.max(self.observations[episode]), 173 | 'metrics': metrics_text, 174 | } 175 | print(template.format(**variables)) 176 | 177 | # Free up resources. 
178 | del self.episode_start[episode] 179 | del self.observations[episode] 180 | del self.rewards[episode] 181 | del self.actions[episode] 182 | del self.metrics[episode] 183 | 184 | def on_step_end(self, step, logs): 185 | episode = logs['episode'] 186 | self.observations[episode].append(logs['observation']) 187 | self.rewards[episode].append(logs['reward']) 188 | self.actions[episode].append(logs['action']) 189 | self.metrics[episode].append(logs['metrics']) 190 | self.step += 1 191 | 192 | 193 | class TrainIntervalLogger(Callback): 194 | def __init__(self, interval=10000): 195 | self.interval = interval 196 | self.step = 0 197 | self.reset() 198 | 199 | def reset(self): 200 | self.interval_start = timeit.default_timer() 201 | self.progbar = Progbar(target=self.interval) 202 | self.metrics = [] 203 | self.infos = [] 204 | self.info_names = None 205 | self.episode_rewards = [] 206 | 207 | def on_train_begin(self, logs): 208 | self.train_start = timeit.default_timer() 209 | self.metrics_names = self.model.metrics_names 210 | print('Training for {} steps ...'.format(self.params['nb_steps'])) 211 | 212 | def on_train_end(self, logs): 213 | duration = timeit.default_timer() - self.train_start 214 | print('done, took {:.3f} seconds'.format(duration)) 215 | 216 | def on_step_begin(self, step, logs): 217 | if self.step % self.interval == 0: 218 | if len(self.episode_rewards) > 0: 219 | metrics = np.array(self.metrics) 220 | assert metrics.shape == (self.interval, len(self.metrics_names)) 221 | formatted_metrics = '' 222 | if not np.isnan(metrics).all(): # not all values are means 223 | means = np.nanmean(self.metrics, axis=0) 224 | assert means.shape == (len(self.metrics_names),) 225 | for name, mean in zip(self.metrics_names, means): 226 | formatted_metrics += ' - {}: {:.3f}'.format(name, mean) 227 | 228 | formatted_infos = '' 229 | if len(self.infos) > 0: 230 | infos = np.array(self.infos) 231 | if not np.isnan(infos).all(): # not all values are means 232 | means = np.nanmean(self.infos, axis=0) 233 | assert means.shape == (len(self.info_names),) 234 | for name, mean in zip(self.info_names, means): 235 | formatted_infos += ' - {}: {:.3f}'.format(name, mean) 236 | print('{} episodes - episode_reward: {:.3f} [{:.3f}, {:.3f}]{}{}'.format(len(self.episode_rewards), np.mean(self.episode_rewards), np.min(self.episode_rewards), np.max(self.episode_rewards), formatted_metrics, formatted_infos)) 237 | print('') 238 | self.reset() 239 | print('Interval {} ({} steps performed)'.format(self.step // self.interval + 1, self.step)) 240 | 241 | def on_step_end(self, step, logs): 242 | if self.info_names is None: 243 | self.info_names = logs['info'].keys() 244 | values = [('reward', logs['reward'])] 245 | self.progbar.update((self.step % self.interval) + 1, values=values, force=True) 246 | self.step += 1 247 | self.metrics.append(logs['metrics']) 248 | if len(self.info_names) > 0: 249 | self.infos.append([logs['info'][k] for k in self.info_names]) 250 | 251 | def on_episode_end(self, episode, logs): 252 | self.episode_rewards.append(logs['episode_reward']) 253 | 254 | 255 | class FileLogger(Callback): 256 | def __init__(self, filepath, interval=None): 257 | self.filepath = filepath 258 | self.interval = interval 259 | 260 | # Some algorithms compute multiple episodes at once since they are multi-threaded. 261 | # We therefore use a dict that maps from episode to metrics array. 
262 | self.metrics = {} 263 | self.starts = {} 264 | self.data = {} 265 | 266 | def on_train_begin(self, logs): 267 | self.metrics_names = self.model.metrics_names 268 | 269 | def on_train_end(self, logs): 270 | self.save_data() 271 | 272 | def on_episode_begin(self, episode, logs): 273 | assert episode not in self.metrics 274 | assert episode not in self.starts 275 | self.metrics[episode] = [] 276 | self.starts[episode] = timeit.default_timer() 277 | 278 | def on_episode_end(self, episode, logs): 279 | duration = timeit.default_timer() - self.starts[episode] 280 | 281 | metrics = self.metrics[episode] 282 | if np.isnan(metrics).all(): 283 | mean_metrics = np.array([np.nan for _ in self.metrics_names]) 284 | else: 285 | mean_metrics = np.nanmean(metrics, axis=0) 286 | assert len(mean_metrics) == len(self.metrics_names) 287 | 288 | data = list(zip(self.metrics_names, mean_metrics)) 289 | data += list(logs.items()) 290 | data += [('episode', episode), ('duration', duration)] 291 | for key, value in data: 292 | if key not in self.data: 293 | self.data[key] = [] 294 | self.data[key].append(value) 295 | 296 | if self.interval is not None and episode % self.interval == 0: 297 | self.save_data() 298 | 299 | # Clean up. 300 | del self.metrics[episode] 301 | del self.starts[episode] 302 | 303 | def on_step_end(self, step, logs): 304 | self.metrics[logs['episode']].append(logs['metrics']) 305 | 306 | def save_data(self): 307 | if len(self.data.keys()) == 0: 308 | return 309 | 310 | # Sort everything by episode. 311 | assert 'episode' in self.data 312 | sorted_indexes = np.argsort(self.data['episode']) 313 | sorted_data = {} 314 | for key, values in self.data.items(): 315 | assert len(self.data[key]) == len(sorted_indexes) 316 | # We convert to np.array() and then to list to convert from np datatypes to native datatypes. 317 | # This is necessary because json.dump cannot handle np.float32, for example. 318 | sorted_data[key] = np.array([self.data[key][idx] for idx in sorted_indexes]).tolist() 319 | 320 | # Overwrite already open file. We can simply seek to the beginning since the file will 321 | # grow strictly monotonously. 322 | with open(self.filepath, 'w') as f: 323 | json.dump(sorted_data, f) 324 | 325 | 326 | class Visualizer(Callback): 327 | def on_action_end(self, action, logs): 328 | self.env.render(mode='human') 329 | 330 | 331 | class ModelIntervalCheckpoint(Callback): 332 | def __init__(self, filepath, interval, verbose=0): 333 | super(ModelIntervalCheckpoint, self).__init__() 334 | self.filepath = filepath 335 | self.interval = interval 336 | self.verbose = verbose 337 | self.total_steps = 0 338 | 339 | def on_step_end(self, step, logs={}): 340 | self.total_steps += 1 341 | if self.total_steps % self.interval != 0: 342 | # Nothing to do. 
343 | return 344 | 345 | filepath = self.filepath.format(step=self.total_steps, **logs) 346 | if self.verbose > 0: 347 | print('Step {}: saving model to {}'.format(self.total_steps, filepath)) 348 | self.model.save_weights(filepath, overwrite=True) 349 | -------------------------------------------------------------------------------- /rl/agents/ddpg.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import deque 3 | import os 4 | import warnings 5 | 6 | import numpy as np 7 | import keras.backend as K 8 | import keras.optimizers as optimizers 9 | 10 | from rl.core import Agent 11 | from rl.random import OrnsteinUhlenbeckProcess 12 | from rl.util import * 13 | 14 | 15 | def mean_q(y_true, y_pred): 16 | return K.mean(K.max(y_pred, axis=-1)) 17 | 18 | 19 | # Deep DPG as described by Lillicrap et al. (2015) 20 | # http://arxiv.org/pdf/1509.02971v2.pdf 21 | # http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.646.4324&rep=rep1&type=pdf 22 | class DDPGAgent(Agent): 23 | """Write me 24 | """ 25 | def __init__(self, nb_actions, actor, critic, critic_action_input, memory, 26 | gamma=.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, 27 | train_interval=1, memory_interval=1, delta_range=None, delta_clip=np.inf, 28 | random_process=None, custom_model_objects={}, target_model_update=.001, **kwargs): 29 | if hasattr(actor.output, '__len__') and len(actor.output) > 1: 30 | raise ValueError('Actor "{}" has more than one output. DDPG expects an actor that has a single output.'.format(actor)) 31 | if hasattr(critic.output, '__len__') and len(critic.output) > 1: 32 | raise ValueError('Critic "{}" has more than one output. DDPG expects a critic that has a single output.'.format(critic)) 33 | if critic_action_input not in critic.input: 34 | raise ValueError('Critic "{}" does not have designated action input "{}".'.format(critic, critic_action_input)) 35 | if not hasattr(critic.input, '__len__') or len(critic.input) < 2: 36 | raise ValueError('Critic "{}" does not have enough inputs. The critic must have at exactly two inputs, one for the action and one for the observation.'.format(critic)) 37 | 38 | super(DDPGAgent, self).__init__(**kwargs) 39 | 40 | # Soft vs hard target model updates. 41 | if target_model_update < 0: 42 | raise ValueError('`target_model_update` must be >= 0.') 43 | elif target_model_update >= 1: 44 | # Hard update every `target_model_update` steps. 45 | target_model_update = int(target_model_update) 46 | else: 47 | # Soft update with `(1 - target_model_update) * old + target_model_update * new`. 48 | target_model_update = float(target_model_update) 49 | 50 | if delta_range is not None: 51 | warnings.warn('`delta_range` is deprecated. Please use `delta_clip` instead, which takes a single scalar. For now we\'re falling back to `delta_range[1] = {}`'.format(delta_range[1])) 52 | delta_clip = delta_range[1] 53 | 54 | # Parameters. 55 | self.nb_actions = nb_actions 56 | self.nb_steps_warmup_actor = nb_steps_warmup_actor 57 | self.nb_steps_warmup_critic = nb_steps_warmup_critic 58 | self.random_process = random_process 59 | self.delta_clip = delta_clip 60 | self.gamma = gamma 61 | self.target_model_update = target_model_update 62 | self.batch_size = batch_size 63 | self.train_interval = train_interval 64 | self.memory_interval = memory_interval 65 | self.custom_model_objects = custom_model_objects 66 | 67 | # Related objects. 
68 | self.actor = actor 69 | self.critic = critic 70 | self.critic_action_input = critic_action_input 71 | self.critic_action_input_idx = self.critic.input.index(critic_action_input) 72 | self.memory = memory 73 | 74 | # State. 75 | self.compiled = False 76 | self.reset_states() 77 | 78 | @property 79 | def uses_learning_phase(self): 80 | return self.actor.uses_learning_phase or self.critic.uses_learning_phase 81 | 82 | def compile(self, optimizer, metrics=[]): 83 | metrics += [mean_q] 84 | 85 | if type(optimizer) in (list, tuple): 86 | if len(optimizer) != 2: 87 | raise ValueError('Please provide exactly two optimizers, the first one for the actor and the second one for the critic.') 88 | actor_optimizer, critic_optimizer = optimizer 89 | else: 90 | actor_optimizer = optimizer 91 | critic_optimizer = clone_optimizer(optimizer) 92 | if type(actor_optimizer) is str: 93 | actor_optimizer = optimizers.get(actor_optimizer) 94 | if type(critic_optimizer) is str: 95 | critic_optimizer = optimizers.get(critic_optimizer) 96 | assert actor_optimizer != critic_optimizer 97 | 98 | if len(metrics) == 2 and hasattr(metrics[0], '__len__') and hasattr(metrics[1], '__len__'): 99 | actor_metrics, critic_metrics = metrics 100 | else: 101 | actor_metrics = critic_metrics = metrics 102 | 103 | def clipped_error(y_true, y_pred): 104 | return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1) 105 | 106 | # Compile target networks. We only use them in feed-forward mode, hence we can pass any 107 | # optimizer and loss since we never use it anyway. 108 | self.target_actor = clone_model(self.actor, self.custom_model_objects) 109 | self.target_actor.compile(optimizer='sgd', loss='mse') 110 | self.target_critic = clone_model(self.critic, self.custom_model_objects) 111 | self.target_critic.compile(optimizer='sgd', loss='mse') 112 | 113 | # We also compile the actor. We never optimize the actor using Keras but instead compute 114 | # the policy gradient ourselves. However, we need the actor in feed-forward mode, hence 115 | # we also compile it with an arbitrary optimizer and loss, which are never actually used. 116 | self.actor.compile(optimizer='sgd', loss='mse') 117 | 118 | # Compile the critic. 119 | if self.target_model_update < 1.: 120 | # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model. 121 | critic_updates = get_soft_target_model_updates(self.target_critic, self.critic, self.target_model_update) 122 | critic_optimizer = AdditionalUpdatesOptimizer(critic_optimizer, critic_updates) 123 | self.critic.compile(optimizer=critic_optimizer, loss=clipped_error, metrics=critic_metrics) 124 | 125 | # Combine actor and critic so that we can get the policy gradient. 126 | # Assuming critic's state inputs are the same as actor's. 127 | combined_inputs = [] 128 | critic_inputs = [] 129 | for i in self.critic.input: 130 | if i == self.critic_action_input: 131 | combined_inputs.append([]) 132 | else: 133 | combined_inputs.append(i) 134 | critic_inputs.append(i) 135 | combined_inputs[self.critic_action_input_idx] = self.actor(critic_inputs) 136 | 137 | combined_output = self.critic(combined_inputs) 138 | 139 | updates = actor_optimizer.get_updates(self.actor.trainable_weights, self.actor.constraints, 140 | loss=-K.mean(combined_output)) 141 | if self.target_model_update < 1.: 142 | # Include soft target model updates.
143 | updates += get_soft_target_model_updates(self.target_actor, self.actor, self.target_model_update) 144 | updates += self.actor.updates # include other updates of the actor, e.g. for BN 145 | 146 | # Finally, combine it all into a callable function. 147 | if self.uses_learning_phase: 148 | critic_inputs += [K.learning_phase()] 149 | self.actor_train_fn = K.function(critic_inputs, [self.actor(critic_inputs)], updates=updates) 150 | self.actor_optimizer = actor_optimizer 151 | 152 | self.compiled = True 153 | 154 | def load_weights(self, filepath): 155 | filename, extension = os.path.splitext(filepath) 156 | actor_filepath = filename + '_actor' + extension 157 | critic_filepath = filename + '_critic' + extension 158 | self.actor.load_weights(actor_filepath) 159 | self.critic.load_weights(critic_filepath) 160 | self.update_target_models_hard() 161 | 162 | def save_weights(self, filepath, overwrite=False): 163 | filename, extension = os.path.splitext(filepath) 164 | actor_filepath = filename + '_actor' + extension 165 | critic_filepath = filename + '_critic' + extension 166 | self.actor.save_weights(actor_filepath, overwrite=overwrite) 167 | self.critic.save_weights(critic_filepath, overwrite=overwrite) 168 | 169 | def update_target_models_hard(self): 170 | self.target_critic.set_weights(self.critic.get_weights()) 171 | self.target_actor.set_weights(self.actor.get_weights()) 172 | 173 | # TODO: implement pickle 174 | 175 | def reset_states(self): 176 | if self.random_process is not None: 177 | self.random_process.reset_states() 178 | self.recent_action = None 179 | self.recent_observation = None 180 | if self.compiled: 181 | self.actor.reset_states() 182 | self.critic.reset_states() 183 | self.target_actor.reset_states() 184 | self.target_critic.reset_states() 185 | 186 | def process_state_batch(self, batch): 187 | batch = np.array(batch) 188 | if self.processor is None: 189 | return batch 190 | return self.processor.process_state_batch(batch) 191 | 192 | def select_action(self, state): 193 | batch = self.process_state_batch([state]) 194 | action = self.actor.predict_on_batch(batch).flatten() 195 | assert action.shape == (self.nb_actions,) 196 | 197 | # Apply noise, if a random process is set. 198 | if self.training and self.random_process is not None: 199 | noise = self.random_process.sample() 200 | assert noise.shape == action.shape 201 | action += noise 202 | 203 | return action 204 | 205 | def forward(self, observation): 206 | # Select an action. 207 | state = self.memory.get_recent_state(observation) 208 | action = self.select_action(state) # TODO: move this into policy 209 | if self.processor is not None: 210 | action = self.processor.process_action(action) 211 | 212 | # Book-keeping. 213 | self.recent_observation = observation 214 | self.recent_action = action 215 | 216 | return action 217 | 218 | @property 219 | def layers(self): 220 | return self.actor.layers[:] + self.critic.layers[:] 221 | 222 | @property 223 | def metrics_names(self): 224 | names = self.critic.metrics_names[:] 225 | if self.processor is not None: 226 | names += self.processor.metrics_names[:] 227 | return names 228 | 229 | def backward(self, reward, terminal=False): 230 | # Store most recent experience in memory. 231 | if self.step % self.memory_interval == 0: 232 | self.memory.append(self.recent_observation, self.recent_action, reward, terminal, 233 | training=self.training) 234 | 235 | metrics = [np.nan for _ in self.metrics_names] 236 | if not self.training: 237 | # We're done here. 
No need to update the experience memory since we only use the working 238 | # memory to obtain the state over the most recent observations. 239 | return metrics 240 | 241 | # Train the network on a single stochastic batch. 242 | can_train_either = self.step > self.nb_steps_warmup_critic or self.step > self.nb_steps_warmup_actor 243 | if can_train_either and self.step % self.train_interval == 0: 244 | experiences = self.memory.sample(self.batch_size) 245 | assert len(experiences) == self.batch_size 246 | 247 | # Start by extracting the necessary parameters (we use a vectorized implementation). 248 | state0_batch = [] 249 | reward_batch = [] 250 | action_batch = [] 251 | terminal1_batch = [] 252 | state1_batch = [] 253 | for e in experiences: 254 | state0_batch.append(e.state0) 255 | state1_batch.append(e.state1) 256 | reward_batch.append(e.reward) 257 | action_batch.append(e.action) 258 | terminal1_batch.append(0. if e.terminal1 else 1.) 259 | 260 | # Prepare and validate parameters. 261 | state0_batch = self.process_state_batch(state0_batch) 262 | state1_batch = self.process_state_batch(state1_batch) 263 | terminal1_batch = np.array(terminal1_batch) 264 | reward_batch = np.array(reward_batch) 265 | action_batch = np.array(action_batch) 266 | assert reward_batch.shape == (self.batch_size,) 267 | assert terminal1_batch.shape == reward_batch.shape 268 | assert action_batch.shape == (self.batch_size, self.nb_actions) 269 | 270 | # Update critic, if warm up is over. 271 | if self.step > self.nb_steps_warmup_critic: 272 | target_actions = self.target_actor.predict_on_batch(state1_batch) 273 | assert target_actions.shape == (self.batch_size, self.nb_actions) 274 | if len(self.critic.inputs) >= 3: 275 | state1_batch_with_action = state1_batch[:] 276 | else: 277 | state1_batch_with_action = [state1_batch] 278 | state1_batch_with_action.insert(self.critic_action_input_idx, target_actions) 279 | target_q_values = self.target_critic.predict_on_batch(state1_batch_with_action).flatten() 280 | assert target_q_values.shape == (self.batch_size,) 281 | 282 | # Compute the critic targets r_t + gamma * Q'(s_{t+1}, mu'(s_{t+1})) using the target actor 283 | # and target critic networks; for terminal transitions the bootstrapped term is zeroed out. 284 | discounted_reward_batch = self.gamma * target_q_values 285 | discounted_reward_batch *= terminal1_batch 286 | assert discounted_reward_batch.shape == reward_batch.shape 287 | targets = (reward_batch + discounted_reward_batch).reshape(self.batch_size, 1) 288 | 289 | # Perform a single batch update on the critic network. 290 | if len(self.critic.inputs) >= 3: 291 | state0_batch_with_action = state0_batch[:] 292 | else: 293 | state0_batch_with_action = [state0_batch] 294 | state0_batch_with_action.insert(self.critic_action_input_idx, action_batch) 295 | metrics = self.critic.train_on_batch(state0_batch_with_action, targets) 296 | if self.processor is not None: 297 | metrics += self.processor.metrics 298 | 299 | # Update actor, if warm up is over.
300 | if self.step > self.nb_steps_warmup_actor: 301 | # TODO: implement metrics for actor 302 | if len(self.actor.inputs) >= 2: 303 | inputs = state0_batch[:] 304 | else: 305 | inputs = [state0_batch] 306 | if self.uses_learning_phase: 307 | inputs += [self.training] 308 | action_values = self.actor_train_fn(inputs)[0] 309 | assert action_values.shape == (self.batch_size, self.nb_actions) 310 | 311 | if self.target_model_update >= 1 and self.step % self.target_model_update == 0: 312 | self.update_target_models_hard() 313 | 314 | return metrics 315 | -------------------------------------------------------------------------------- /rl/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import warnings 3 | from copy import deepcopy 4 | 5 | import numpy as np 6 | from keras.callbacks import History 7 | 8 | from rl.callbacks import TestLogger, TrainEpisodeLogger, TrainIntervalLogger, Visualizer, CallbackList 9 | 10 | 11 | class Agent(object): 12 | """Abstract base class for all implemented agents. 13 | 14 | Each agent interacts with the environment (as defined by the `Env` class) by first observing the 15 | state of the environment. Based on this observation the agent changes the environment by performing 16 | an action. 17 | 18 | Do not use this abstract base class directly but instead use one of the concrete agents implemented. 19 | Each agent realizes a reinforcement learning algorithm. Since all agents conform to the same 20 | interface, you can use them interchangeably. 21 | 22 | To implement your own agent, you have to implement the following methods: 23 | 24 | - `forward` 25 | - `backward` 26 | - `compile` 27 | - `load_weights` 28 | - `save_weights` 29 | - `layers` 30 | 31 | # Arguments 32 | processor (`Processor` instance): See [Processor](#processor) for details. 33 | """ 34 | def __init__(self, processor=None): 35 | self.processor = processor 36 | self.training = False 37 | self.step = 0 38 | 39 | def get_config(self): 40 | """Configuration of the agent for serialization. 41 | """ 42 | return {} 43 | 44 | def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, 45 | visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, 46 | nb_max_episode_steps=None): 47 | """Trains the agent on the given environment. 48 | 49 | # Arguments 50 | env: (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. 51 | nb_steps (integer): Number of training steps to be performed. 52 | action_repetition (integer): Number of times the agent repeats the same action without 53 | observing the environment again. Setting this to a value > 1 can be useful 54 | if a single action only has a very small effect on the environment. 55 | callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): 56 | List of callbacks to apply during training. See [callbacks](/callbacks) for details. 57 | verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging 58 | visualize (boolean): If `True`, the environment is visualized during training. However, 59 | this is likely going to slow down training significantly and is thus intended to be 60 | a debugging instrument. 61 | nb_max_start_steps (integer): Number of maximum steps that the agent performs at the beginning 62 | of each episode using `start_step_policy`. 
Notice that this is an upper limit since 63 | the exact number of steps to be performed is sampled uniformly from [0, nb_max_start_steps] 64 | at the beginning of each episode. 65 | start_step_policy (`lambda observation: action`): The policy 66 | to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. 67 | log_interval (integer): If `verbose` = 1, the number of steps that are considered to be an interval. 68 | nb_max_episode_steps (integer): Number of steps per episode that the agent performs before 69 | automatically resetting the environment. Set to `None` if each episode should run 70 | (potentially indefinitely) until the environment signals a terminal state. 71 | 72 | # Returns 73 | A `keras.callbacks.History` instance that recorded the entire training process. 74 | """ 75 | if not self.compiled: 76 | raise RuntimeError('You tried to fit your agent but it hasn\'t been compiled yet. Please call `compile()` before `fit()`.') 77 | if action_repetition < 1: 78 | raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition)) 79 | 80 | self.training = True 81 | 82 | callbacks = [] if not callbacks else callbacks[:] 83 | 84 | if verbose == 1: 85 | callbacks += [TrainIntervalLogger(interval=log_interval)] 86 | elif verbose > 1: 87 | callbacks += [TrainEpisodeLogger()] 88 | if visualize: 89 | callbacks += [Visualizer()] 90 | history = History() 91 | callbacks += [history] 92 | callbacks = CallbackList(callbacks) 93 | if hasattr(callbacks, 'set_model'): 94 | callbacks.set_model(self) 95 | else: 96 | callbacks._set_model(self) 97 | callbacks._set_env(env) 98 | params = { 99 | 'nb_steps': nb_steps, 100 | } 101 | if hasattr(callbacks, 'set_params'): 102 | callbacks.set_params(params) 103 | else: 104 | callbacks._set_params(params) 105 | self._on_train_begin() 106 | callbacks.on_train_begin() 107 | 108 | episode = 0 109 | self.step = 0 110 | observation = None 111 | episode_reward = None 112 | episode_step = None 113 | did_abort = False 114 | try: 115 | while self.step < nb_steps: 116 | if observation is None: # start of a new episode 117 | callbacks.on_episode_begin(episode) 118 | episode_step = 0 119 | episode_reward = 0. 120 | 121 | # Obtain the initial observation by resetting the environment. 122 | self.reset_states() 123 | observation = deepcopy(env.reset()) 124 | if self.processor is not None: 125 | observation = self.processor.process_observation(observation) 126 | assert observation is not None 127 | 128 | # Perform random starts at beginning of episode and do not record them into the experience. 129 | # This slightly changes the start position between games. 130 | nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps) 131 | for _ in range(nb_random_start_steps): 132 | if start_step_policy is None: 133 | action = env.action_space.sample() 134 | else: 135 | action = start_step_policy(observation) 136 | if self.processor is not None: 137 | action = self.processor.process_action(action) 138 | callbacks.on_action_begin(action) 139 | observation, reward, done, info = env.step(action) 140 | observation = deepcopy(observation) 141 | if self.processor is not None: 142 | observation, reward, done, info = self.processor.process_step(observation, reward, done, info) 143 | callbacks.on_action_end(action) 144 | if done: 145 | warnings.warn('Env ended before {} random steps could be performed at the start.
You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps)) 146 | observation = deepcopy(env.reset()) 147 | if self.processor is not None: 148 | observation = self.processor.process_observation(observation) 149 | break 150 | 151 | # At this point, we expect to be fully initialized. 152 | assert episode_reward is not None 153 | assert episode_step is not None 154 | assert observation is not None 155 | 156 | # Run a single step. 157 | callbacks.on_step_begin(episode_step) 158 | # This is where all of the work happens. We first perceive and compute the action 159 | # (forward step) and then use the reward to improve (backward step). 160 | action = self.forward(observation) 161 | if self.processor is not None: 162 | action = self.processor.process_action(action) 163 | reward = 0. 164 | accumulated_info = {} 165 | done = False 166 | for _ in range(action_repetition): 167 | callbacks.on_action_begin(action) 168 | observation, r, done, info = env.step(action) 169 | observation = deepcopy(observation) 170 | if self.processor is not None: 171 | observation, r, done, info = self.processor.process_step(observation, r, done, info) 172 | for key, value in info.items(): 173 | if not np.isreal(value): 174 | continue 175 | if key not in accumulated_info: 176 | accumulated_info[key] = np.zeros_like(value) 177 | accumulated_info[key] += value 178 | callbacks.on_action_end(action) 179 | reward += r 180 | if done: 181 | break 182 | if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: 183 | # Force a terminal state. 184 | done = True 185 | metrics = self.backward(reward, terminal=done) 186 | episode_reward += reward 187 | 188 | step_logs = { 189 | 'action': action, 190 | 'observation': observation, 191 | 'reward': reward, 192 | 'metrics': metrics, 193 | 'episode': episode, 194 | 'info': accumulated_info, 195 | } 196 | callbacks.on_step_end(episode_step, step_logs) 197 | episode_step += 1 198 | self.step += 1 199 | 200 | if done: 201 | # We are in a terminal state but the agent hasn't yet seen it. We therefore 202 | # perform one more forward-backward call and simply ignore the action before 203 | # resetting the environment. We need to pass in `terminal=False` here since 204 | # the *next* state, that is the state of the newly reset environment, is 205 | # always non-terminal by convention. 206 | self.forward(observation) 207 | self.backward(0., terminal=False) 208 | 209 | # This episode is finished, report and reset. 210 | episode_logs = { 211 | 'episode_reward': episode_reward, 212 | 'nb_episode_steps': episode_step, 213 | 'nb_steps': self.step, 214 | } 215 | callbacks.on_episode_end(episode, episode_logs) 216 | 217 | episode += 1 218 | observation = None 219 | episode_step = None 220 | episode_reward = None 221 | except KeyboardInterrupt: 222 | # We catch keyboard interrupts here so that training can be safely aborted. 223 | # This is so common that we've built this right into this function, which ensures that 224 | # the `on_train_end` method is properly called. 225 | did_abort = True 226 | callbacks.on_train_end(logs={'did_abort': did_abort}) 227 | self._on_train_end() 228 | 229 | return history 230 | 231 | def test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, 232 | nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1): 233 | """Tests the agent on the given environment.
234 | """ 235 | if not self.compiled: 236 | raise RuntimeError('You tried to test your agent but it hasn\'t been compiled yet. Please call `compile()` before `test()`.') 237 | if action_repetition < 1: 238 | raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition)) 239 | 240 | self.training = False 241 | self.step = 0 242 | 243 | callbacks = [] if not callbacks else callbacks[:] 244 | 245 | if verbose >= 1: 246 | callbacks += [TestLogger()] 247 | if visualize: 248 | callbacks += [Visualizer()] 249 | history = History() 250 | callbacks += [history] 251 | callbacks = CallbackList(callbacks) 252 | if hasattr(callbacks, 'set_model'): 253 | callbacks.set_model(self) 254 | else: 255 | callbacks._set_model(self) 256 | callbacks._set_env(env) 257 | params = { 258 | 'nb_episodes': nb_episodes, 259 | } 260 | if hasattr(callbacks, 'set_params'): 261 | callbacks.set_params(params) 262 | else: 263 | callbacks._set_params(params) 264 | 265 | self._on_test_begin() 266 | callbacks.on_train_begin() 267 | for episode in range(nb_episodes): 268 | callbacks.on_episode_begin(episode) 269 | episode_reward = 0. 270 | episode_step = 0 271 | 272 | # Obtain the initial observation by resetting the environment. 273 | self.reset_states() 274 | observation = deepcopy(env.reset()) 275 | if self.processor is not None: 276 | observation = self.processor.process_observation(observation) 277 | assert observation is not None 278 | 279 | # Perform random starts at beginning of episode and do not record them into the experience. 280 | # This slightly changes the start position between games. 281 | nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps) 282 | for _ in range(nb_random_start_steps): 283 | if start_step_policy is None: 284 | action = env.action_space.sample() 285 | else: 286 | action = start_step_policy(observation) 287 | if self.processor is not None: 288 | action = self.processor.process_action(action) 289 | callbacks.on_action_begin(action) 290 | observation, r, done, info = env.step(action) 291 | observation = deepcopy(observation) 292 | if self.processor is not None: 293 | observation, r, done, info = self.processor.process_step(observation, r, done, info) 294 | callbacks.on_action_end(action) 295 | if done: 296 | warnings.warn('Env ended before {} random steps could be performed at the start. You should probably lower the `nb_max_start_steps` parameter.'.format(nb_random_start_steps)) 297 | observation = deepcopy(env.reset()) 298 | if self.processor is not None: 299 | observation = self.processor.process_observation(observation) 300 | break 301 | 302 | # Run the episode until we're done. 303 | done = False 304 | while not done: 305 | callbacks.on_step_begin(episode_step) 306 | 307 | action = self.forward(observation) 308 | if self.processor is not None: 309 | action = self.processor.process_action(action) 310 | reward = 0.
311 | accumulated_info = {} 312 | for _ in range(action_repetition): 313 | callbacks.on_action_begin(action) 314 | observation, r, d, info = env.step(action) 315 | observation = deepcopy(observation) 316 | if self.processor is not None: 317 | observation, r, d, info = self.processor.process_step(observation, r, d, info) 318 | callbacks.on_action_end(action) 319 | reward += r 320 | for key, value in info.items(): 321 | if not np.isreal(value): 322 | continue 323 | if key not in accumulated_info: 324 | accumulated_info[key] = np.zeros_like(value) 325 | accumulated_info[key] += value 326 | if d: 327 | done = True 328 | break 329 | if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1: 330 | done = True 331 | self.backward(reward, terminal=done) 332 | episode_reward += reward 333 | 334 | step_logs = { 335 | 'action': action, 336 | 'observation': observation, 337 | 'reward': reward, 338 | 'episode': episode, 339 | 'info': accumulated_info, 340 | } 341 | callbacks.on_step_end(episode_step, step_logs) 342 | episode_step += 1 343 | self.step += 1 344 | 345 | # We are in a terminal state but the agent hasn't yet seen it. We therefore 346 | # perform one more forward-backward call and simply ignore the action before 347 | # resetting the environment. We need to pass in `terminal=False` here since 348 | # the *next* state, that is the state of the newly reset environment, is 349 | # always non-terminal by convention. 350 | self.forward(observation) 351 | self.backward(0., terminal=False) 352 | 353 | # Report end of episode. 354 | episode_logs = { 355 | 'episode_reward': episode_reward, 356 | 'nb_steps': episode_step, 357 | } 358 | callbacks.on_episode_end(episode, episode_logs) 359 | callbacks.on_train_end() 360 | self._on_test_end() 361 | 362 | return history 363 | 364 | def reset_states(self): 365 | """Resets all internally kept states after an episode is completed. 366 | """ 367 | pass 368 | 369 | def forward(self, observation): 370 | """Takes an observation from the environment and returns the action to be taken next. 371 | If the policy is implemented by a neural network, this corresponds to a forward (inference) pass. 372 | 373 | # Argument 374 | observation (object): The current observation from the environment. 375 | 376 | # Returns 377 | The next action to be executed in the environment. 378 | """ 379 | raise NotImplementedError() 380 | 381 | def backward(self, reward, terminal): 382 | """Updates the agent after having executed the action returned by `forward`. 383 | If the policy is implemented by a neural network, this corresponds to a weight update using back-prop. 384 | 385 | # Argument 386 | reward (float): The observed reward after executing the action returned by `forward`. 387 | terminal (boolean): `True` if the new state of the environment is terminal. 388 | """ 389 | raise NotImplementedError() 390 | 391 | def compile(self, optimizer, metrics=[]): 392 | """Compiles an agent and the underlying models to be used for training and testing. 393 | 394 | # Arguments 395 | optimizer (`keras.optimizers.Optimizer` instance): The optimizer to be used during training. 396 | metrics (list of functions `lambda y_true, y_pred: metric`): The metrics to run during training. 397 | """ 398 | raise NotImplementedError() 399 | 400 | def load_weights(self, filepath): 401 | """Loads the weights of an agent from an HDF5 file. 402 | 403 | # Arguments 404 | filepath (str): The path to the HDF5 file.
405 | """ 406 | raise NotImplementedError() 407 | 408 | def save_weights(self, filepath, overwrite=False): 409 | """Saves the weights of an agent as an HDF5 file. 410 | 411 | # Arguments 412 | filepath (str): The path to where the weights should be saved. 413 | overwrite (boolean): If `False` and `filepath` already exists, raises an error. 414 | """ 415 | raise NotImplementedError() 416 | 417 | @property 418 | def layers(self): 419 | """Returns all layers of the underlying model(s). 420 | 421 | If the concrete implementation uses multiple internal models, 422 | this method returns them in a concatenated list. 423 | """ 424 | raise NotImplementedError() 425 | 426 | @property 427 | def metrics_names(self): 428 | """The human-readable names of the agent's metrics. Must return as many names as there 429 | are metrics (see also `compile`). 430 | """ 431 | return [] 432 | 433 | def _on_train_begin(self): 434 | """Callback that is called before training begins. 435 | """ 436 | pass 437 | 438 | def _on_train_end(self): 439 | """Callback that is called after training ends. 440 | """ 441 | pass 442 | 443 | def _on_test_begin(self): 444 | """Callback that is called before testing begins. 445 | """ 446 | pass 447 | 448 | def _on_test_end(self): 449 | """Callback that is called after testing ends. 450 | """ 451 | pass 452 | 453 | 454 | class Processor(object): 455 | """Abstract base class for implementing processors. 456 | 457 | A processor acts as a coupling mechanism between an `Agent` and its `Env`. This can 458 | be necessary if your agent has different requirements with respect to the form of the 459 | observations, actions, and rewards of the environment. By implementing a custom processor, 460 | you can effectively translate between the two without having to change the underlying 461 | implementation of the agent or environment. 462 | 463 | Do not use this abstract base class directly but instead use one of the concrete implementations 464 | or write your own. 465 | """ 466 | 467 | def process_step(self, observation, reward, done, info): 468 | """Processes an entire step by applying the processor to the observation, reward, and info arguments. 469 | 470 | # Arguments 471 | observation (object): An observation as obtained by the environment. 472 | reward (float): A reward as obtained by the environment. 473 | done (boolean): `True` if the environment is in a terminal state, `False` otherwise. 474 | info (dict): The debug info dictionary as obtained by the environment. 475 | 476 | # Returns 477 | The tuple (observation, reward, done, info) with all elements after being processed. 478 | """ 479 | observation = self.process_observation(observation) 480 | reward = self.process_reward(reward) 481 | info = self.process_info(info) 482 | return observation, reward, done, info 483 | 484 | def process_observation(self, observation): 485 | """Processes the observation as obtained from the environment for use in an agent and 486 | returns it. 487 | """ 488 | return observation 489 | 490 | def process_reward(self, reward): 491 | """Processes the reward as obtained from the environment for use in an agent and 492 | returns it. 493 | """ 494 | return reward 495 | 496 | def process_info(self, info): 497 | """Processes the info as obtained from the environment for use in an agent and 498 | returns it. 499 | """ 500 | return info 501 | 502 | def process_action(self, action): 503 | """Processes an action predicted by an agent but before execution in an environment.
504 | """ 505 | return action 506 | 507 | def process_state_batch(self, batch): 508 | """Processes an entire batch of states and returns it. 509 | """ 510 | return batch 511 | 512 | @property 513 | def metrics(self): 514 | """The metrics of the processor, which will be reported during training. 515 | 516 | # Returns 517 | List of `lambda y_true, y_pred: metric` functions. 518 | """ 519 | return [] 520 | 521 | @property 522 | def metrics_names(self): 523 | """The human-readable names of the processor's metrics. Must return as many names as there 524 | are metrics (see also `compile`). 525 | """ 526 | return [] 527 | 528 | 529 | # Note: the API of the `Env` and `Space` classes is taken from the OpenAI Gym implementation. 530 | # https://github.com/openai/gym/blob/master/gym/core.py 531 | 532 | 533 | class Env(object): 534 | """The abstract environment class that is used by all agents. This class has the exact 535 | same API that OpenAI Gym uses so that integrating with it is trivial. In contrast to the 536 | OpenAI Gym implementation, this class only defines the abstract methods without any actual 537 | implementation. 538 | """ 539 | reward_range = (-np.inf, np.inf) 540 | action_space = None 541 | observation_space = None 542 | 543 | def step(self, action): 544 | """Run one timestep of the environment's dynamics. 545 | Accepts an action and returns a tuple (observation, reward, done, info). 546 | 547 | # Arguments 548 | action (object): An action provided by the agent. 549 | 550 | # Returns 551 | observation (object): Agent's observation of the current environment. 552 | reward (float): Amount of reward returned after previous action. 553 | done (boolean): Whether the episode has ended, in which case further step() calls will return undefined results. 554 | info (dict): Contains auxiliary diagnostic information (helpful for debugging, and sometimes learning). 555 | """ 556 | raise NotImplementedError() 557 | 558 | def reset(self): 559 | """ 560 | Resets the state of the environment and returns an initial observation. 561 | 562 | # Returns 563 | observation (object): The initial observation of the space. Initial reward is assumed to be 0. 564 | """ 565 | raise NotImplementedError() 566 | 567 | def render(self, mode='human', close=False): 568 | """Renders the environment. 569 | The set of supported modes varies per environment. (And some 570 | environments do not support rendering at all.) 571 | 572 | # Arguments 573 | mode (str): The mode to render with. 574 | close (bool): Close all open renderings. 575 | """ 576 | raise NotImplementedError() 577 | 578 | def close(self): 579 | """Override in your subclass to perform any necessary cleanup. 580 | Environments will automatically close() themselves when 581 | garbage collected or when the program exits. 582 | """ 583 | raise NotImplementedError() 584 | 585 | def seed(self, seed=None): 586 | """Sets the seed for this env's random number generator(s). 587 | 588 | # Returns 589 | Returns the list of seeds used in this env's random number generators 590 | """ 591 | raise NotImplementedError() 592 | 593 | def configure(self, *args, **kwargs): 594 | """Provides runtime configuration to the environment. 595 | This configuration should consist of data that tells your 596 | environment how to run (such as an address of a remote server, 597 | or path to your ImageNet data). It should not affect the 598 | semantics of the environment.
599 | """ 600 | raise NotImplementedError() 601 | 602 | def __del__(self): 603 | self.close() 604 | 605 | def __str__(self): 606 | return '<{} instance>'.format(type(self).__name__) 607 | 608 | 609 | class Space(object): 610 | """Abstract model for a space that is used for the state and action spaces. This class has the 611 | exact same API that OpenAI Gym uses so that integrating with it is trivial. 612 | """ 613 | 614 | def sample(self, seed=None): 615 | """Uniformly randomly sample a random element of this space. 616 | """ 617 | raise NotImplementedError() 618 | 619 | def contains(self, x): 620 | """Return boolean specifying if x is a valid member of this space 621 | """ 622 | raise NotImplementedError() 623 | --------------------------------------------------------------------------------
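The abstract `Env` and `Processor` interfaces in `rl/core.py` above are meant to be subclassed. Below is a minimal illustrative sketch (not a file from this repository) of how they are typically implemented; the class names `GuessNumberEnv` and `ClipRewardProcessor`, as well as the toy dynamics, are hypothetical, and only the interfaces defined in `rl/core.py` are assumed.

```python
import numpy as np

from rl.core import Env, Processor


class GuessNumberEnv(Env):
    """Hypothetical toy environment: guess a fixed integer in as few steps as possible."""

    def __init__(self, target=3):
        self.target = target
        self.last_guess = None

    def step(self, action):
        # Follows the (observation, reward, done, info) contract of `Env.step`.
        self.last_guess = int(action)
        done = self.last_guess == self.target
        reward = 1.0 if done else -0.1
        return np.array([self.last_guess], dtype=np.float32), reward, done, {}

    def reset(self):
        # Return the initial observation; -1 encodes "no guess made yet".
        self.last_guess = None
        return np.array([-1.0], dtype=np.float32)

    def render(self, mode='human', close=False):
        print('last guess: {}'.format(self.last_guess))

    def close(self):
        pass

    def seed(self, seed=None):
        np.random.seed(seed)
        return [seed]

    def configure(self, *args, **kwargs):
        pass


class ClipRewardProcessor(Processor):
    """Hypothetical processor that clips rewards to [-1, 1] before the agent sees them."""

    def process_reward(self, reward):
        return float(np.clip(reward, -1.0, 1.0))
```

A concrete agent from this library would then be constructed with `processor=ClipRewardProcessor()` and driven via `agent.fit(GuessNumberEnv(), nb_steps=...)` and `agent.test(...)`, which execute the training and testing loops defined in `rl/core.py`.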