├── rl ├── __init__.py ├── agents │ ├── __init__.py │ ├── cem.py │ ├── sarsa.py │ └── ddpg.py ├── keras_future.py ├── random.py ├── processors.py ├── policy.py ├── util.py ├── memory.py ├── callbacks.py └── core.py ├── tests ├── __init__.py ├── rl │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── test_cem.py │ │ ├── test_ddpg.py │ │ └── test_dqn.py │ ├── util.py │ ├── test_util.py │ ├── test_core.py │ └── test_memory.py └── integration │ ├── test_continuous.py │ └── test_discrete.py ├── docs ├── sources │ ├── index.md │ ├── agents │ │ ├── naf.md │ │ ├── sarsa.md │ │ ├── ddpg.md │ │ ├── cem.md │ │ ├── dqn.md │ │ └── overview.md │ ├── processors.md │ └── core.md ├── templates │ ├── index.md │ ├── core.md │ ├── processors.md │ └── agents │ │ ├── naf.md │ │ ├── ddpg.md │ │ ├── sarsa.md │ │ ├── cem.md │ │ ├── dqn.md │ │ └── overview.md ├── requirements.txt └── autogen.py ├── setup.cfg ├── assets ├── breakout.gif ├── cartpole.gif └── pendulum.gif ├── setup.py ├── mkdocs.yml ├── ISSUE_TEMPLATE.md ├── LICENSE ├── .gitignore ├── pytest.ini ├── examples ├── sarsa_cartpole.py ├── visualize_log.py ├── dqn_cartpole.py ├── cem_cartpole.py ├── duel_dqn_cartpole.py ├── ddpg_pendulum.py ├── ddpg_mujoco.py ├── naf_pendulum.py └── dqn_atari.py ├── .travis.yml └── README.md /rl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/sources/index.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/templates/index.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/rl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/rl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/templates/core.md: -------------------------------------------------------------------------------- 1 | {{autogenerated}} 2 | -------------------------------------------------------------------------------- /docs/templates/processors.md: -------------------------------------------------------------------------------- 1 | {{autogenerated}} 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | mkdocs 3 | python-markdown-math 4 | -------------------------------------------------------------------------------- /assets/breakout.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cafe/keras-rl/master/assets/breakout.gif 
-------------------------------------------------------------------------------- /assets/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cafe/keras-rl/master/assets/cartpole.gif -------------------------------------------------------------------------------- /assets/pendulum.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cafe/keras-rl/master/assets/pendulum.gif -------------------------------------------------------------------------------- /rl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .dqn import DQNAgent, NAFAgent, ContinuousDQNAgent 3 | from .ddpg import DDPGAgent 4 | from .cem import CEMAgent 5 | from .sarsa import SarsaAgent, SARSAAgent 6 | -------------------------------------------------------------------------------- /docs/templates/agents/naf.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Continuous Deep Q-Learning with Model-based Acceleration](https://arxiv.org/abs/1603.00748), Gu et al., 2016 11 | -------------------------------------------------------------------------------- /docs/templates/agents/ddpg.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971), Lillicrap et al., 2015 11 | -------------------------------------------------------------------------------- /docs/templates/agents/sarsa.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Reinforcement learning: An introduction](http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf), Sutton and Barto, 2011 11 | -------------------------------------------------------------------------------- /docs/templates/agents/cem.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Learning Tetris Using the Noisy Cross-Entropy Method](http://www.mitpressjournals.org/doi/abs/10.1162/neco.2006.18.12.2936?journalCode=neco), Szita et al., 2006 11 | - [Deep Reinforcement Learning (MLSS lecture notes)](http://learning.mpi-sws.org/mlss2016/slides/2016-MLSS-RL.pdf), Schulman, 2016 12 | -------------------------------------------------------------------------------- /docs/sources/agents/naf.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/dqn.py#L548) 6 | ### NAFAgent 7 | 8 | ```python 9 | rl.agents.dqn.NAFAgent(V_model, L_model, mu_model, random_process=None, covariance_mode='full') 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Continuous Deep Q-Learning with Model-based Acceleration](https://arxiv.org/abs/1603.00748), Gu et al., 2016 19 | -------------------------------------------------------------------------------- /docs/sources/agents/sarsa.md: 
-------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/sarsa.py#L17) 6 | ### SARSAAgent 7 | 8 | ```python 9 | rl.agents.sarsa.SARSAAgent(model, nb_actions, policy=None, test_policy=None, gamma=0.99, nb_steps_warmup=10, train_interval=1, delta_clip=inf) 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Reinforcement learning: An introduction](http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf), Sutton and Barto, 2011 19 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools import find_packages 3 | 4 | 5 | setup(name='keras-rl', 6 | version='0.3.1', 7 | description='Deep Reinforcement Learning for Keras', 8 | author='Matthias Plappert', 9 | author_email='matthiasplappert@me.com', 10 | url='https://github.com/matthiasplappert/keras-rl', 11 | download_url='https://github.com/matthiasplappert/keras-rl/archive/v0.3.1.tar.gz', 12 | license='MIT', 13 | install_requires=['keras>=1.0.7,<2.0.7'], 14 | extras_require={ 15 | 'gym': ['gym'], 16 | }, 17 | packages=find_packages()) 18 | -------------------------------------------------------------------------------- /docs/templates/agents/dqn.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | {{autogenerated}} 6 | 7 | --- 8 | 9 | ### References 10 | - [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/abs/1312.5602), Mnih et al., 2013 11 | - [Human-level control through deep reinforcement learning](http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html), Mnih et al., 2015 12 | - [Deep Reinforcement Learning with Double Q-learning](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Applications_files/doubledqn.pdf), van Hasselt et al., 2015 13 | - [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581), Wang et al., 2016 14 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Keras-RL Documentation 2 | theme: readthedocs 3 | docs_dir: docs/sources 4 | repo_url: https://github.com/matthiasplappert/keras-rl 5 | site_description: 'Documentation for Keras-RL, a library for Deep Reinforcement Learning with Keras.' 
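# The two commented-out lines below enable LaTeX math rendering in the docs; mdx_math is provided by the python-markdown-math package listed in docs/requirements.txt.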
6 | #markdown_extensions: [mdx_math] 7 | #extra_javascript: ['https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML'] 8 | 9 | dev_addr: '0.0.0.0:8000' 10 | 11 | pages: 12 | - Home: index.md 13 | - Core: core.md 14 | - Agents: 15 | - Overview: agents/overview.md 16 | - DQNAgent: agents/dqn.md 17 | - NAFAgent: agents/naf.md 18 | - DDPGAgent: agents/ddpg.md 19 | - SARSAAgent: agents/sarsa.md 20 | - CEMAgent: agents/cem.md 21 | -------------------------------------------------------------------------------- /docs/sources/agents/ddpg.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/ddpg.py#L22) 6 | ### DDPGAgent 7 | 8 | ```python 9 | rl.agents.ddpg.DDPGAgent(nb_actions, actor, critic, critic_action_input, memory, gamma=0.99, batch_size=32, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, train_interval=1, memory_interval=1, delta_range=None, delta_clip=inf, random_process=None, custom_model_objects={}, target_model_update=0.001) 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971), Lillicrap et al., 2015 19 | -------------------------------------------------------------------------------- /tests/rl/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | from rl.core import Env 5 | 6 | 7 | class MultiInputTestEnv(Env): 8 | def __init__(self, observation_shape): 9 | self.observation_shape = observation_shape 10 | 11 | def step(self, action): 12 | return self._get_obs(), random.choice([0, 1]), random.choice([True, False]), {} 13 | 14 | def reset(self): 15 | return self._get_obs() 16 | 17 | def _get_obs(self): 18 | if type(self.observation_shape) is list: 19 | return [np.random.random(s) for s in self.observation_shape] 20 | else: 21 | return np.random.random(self.observation_shape) 22 | 23 | def __del__(self): 24 | pass 25 | -------------------------------------------------------------------------------- /rl/keras_future.py: -------------------------------------------------------------------------------- 1 | import keras 2 | import keras.layers 3 | import keras.models 4 | 5 | 6 | def concatenate(x): 7 | if hasattr(keras.layers, 'Concatenate'): 8 | return keras.layers.Concatenate()(x) 9 | else: 10 | return keras.layers.merge(x, mode='concat') 11 | 12 | 13 | def add(x): 14 | if hasattr(keras.layers, 'Add'): 15 | return keras.layers.Add()(x) 16 | else: 17 | return keras.layers.merge(x, mode='sum') 18 | 19 | 20 | def Model(input, output, **kwargs): 21 | if int(keras.__version__.split('.')[0]) >= 2: 22 | return keras.models.Model(inputs=input, outputs=output, **kwargs) 23 | else: 24 | return keras.models.Model(input=input, output=output, **kwargs) 25 | -------------------------------------------------------------------------------- /docs/sources/agents/cem.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/cem.py#L12) 6 | ### CEMAgent 7 | 8 | ```python 9 | rl.agents.cem.CEMAgent(model, nb_actions, memory, batch_size=50, nb_steps_warmup=1000, train_interval=50, elite_frac=0.05, memory_interval=1, theta_init=None, noise_decay_const=0.0, noise_ampl=0.0) 10 | ``` 11 | 12 | Write me 13 | 
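A minimal usage sketch (not auto-generated; it closely mirrors `examples/cem_cartpole.py` in this repository, from which the environment and hyperparameters are taken):

```python
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

env = gym.make('CartPole-v0')
nb_actions = env.action_space.n

# A simple softmax policy over the discrete actions.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
cem.test(env, nb_episodes=5, visualize=True)
```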
14 | 15 | --- 16 | 17 | ### References 18 | - [Learning Tetris Using the Noisy Cross-Entropy Method](http://www.mitpressjournals.org/doi/abs/10.1162/neco.2006.18.12.2936?journalCode=neco), Szita et al., 2006 19 | - [Deep Reinforcement Learning (MLSS lecture notes)](http://learning.mpi-sws.org/mlss2016/slides/2016-MLSS-RL.pdf), Schulman, 2016 20 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Please make sure that the boxes below are checked before you submit your issue. If your issue is an implementation question, please ask your question in the [Keras-RL Google group](https://groups.google.com/forum/#!forum/keras-rl-users) or [join the Keras-RL Gitter channel](https://gitter.im/keras-rl/Lobby) and ask there instead of filing a GitHub issue. 2 | 3 | Thank you! 4 | 5 | - [ ] Check that you are up-to-date with the master branch of Keras-RL. You can update with: 6 | `pip install git+git://github.com/matthiasplappert/keras-rl.git --upgrade --no-deps` 7 | 8 | - [ ] Check that you are up-to-date with the master branch of Keras. You can update with: 9 | `pip install git+git://github.com/fchollet/keras.git --upgrade --no-deps` 10 | 11 | - [ ] Provide a link to a GitHub Gist of a Python script that can reproduce your issue (or just copy the script here if it is short). If you report an error, please include the error message and the backtrace. 12 | -------------------------------------------------------------------------------- /docs/sources/agents/dqn.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | --- 4 | 5 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/agents/dqn.py#L89) 6 | ### DQNAgent 7 | 8 | ```python 9 | rl.agents.dqn.DQNAgent(model, policy=None, test_policy=None, enable_double_dqn=True, enable_dueling_network=False, dueling_type='avg') 10 | ``` 11 | 12 | Write me 13 | 14 | 15 | --- 16 | 17 | ### References 18 | - [Playing Atari with Deep Reinforcement Learning](https://arxiv.org/abs/1312.5602), Mnih et al., 2013 19 | - [Human-level control through deep reinforcement learning](http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html), Mnih et al., 2015 20 | - [Deep Reinforcement Learning with Double Q-learning](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Applications_files/doubledqn.pdf), van Hasselt et al., 2015 21 | - [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581), Wang et al., 2016 22 | -------------------------------------------------------------------------------- /docs/templates/agents/overview.md: -------------------------------------------------------------------------------- 1 | ## Available Agents 2 | 3 | | Name | Implementation | Observation Space | Action Space | 4 | | ---------------------- |------------------------| -------------------| ---------------| 5 | | [DQN](/agents/dqn) | `rl.agents.DQNAgent` | discrete or continuous | discrete | 6 | | [DDPG](/agents/ddpg) | `rl.agents.DDPGAgent` | discrete or continuous | continuous | 7 | | [NAF](/agents/naf) | `rl.agents.NAFAgent` | discrete or continuous | continuous | 8 | | [CEM](/agents/cem) | `rl.agents.CEMAgent` | discrete or continuous | discrete | 9 | | [SARSA](/agents/sarsa) | `rl.agents.SARSAAgent` | discrete or continuous | discrete | 10 | 11 | --- 12 | 13 | ## Common API 14 | 15 | All agents share a common API. 
This allows you to easily switch between different agents. 16 | That being said, keep in mind that some agents make assumptions regarding the action space, i.e. assume discrete 17 | or continuous actions. 18 | 19 | {{autogenerated}} 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Matthias Plappert 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS X 2 | .DS_Store 3 | docs/site/* 4 | 5 | # Ubuntu 6 | *~ 7 | 8 | # PyCharm 9 | .idea 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | env/ 21 | build/ 22 | download/ 23 | bin/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | include/ 33 | lib/ 34 | man/ 35 | local/ 36 | var/ 37 | share/ 38 | pip-selfcheck.json 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | 43 | # PyInstaller 44 | # Usually these files are written by a python script from a template 45 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 46 | *.manifest 47 | *.spec 48 | 49 | # Installer logs 50 | pip-log.txt 51 | pip-delete-this-directory.txt 52 | 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *,cover 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # Configuration of py.test 2 | [pytest] 3 | addopts=-v 4 | -n 2 5 | --durations=10 6 | --cov-report term-missing 7 | --cov=rl 8 | 9 | # Do not run tests in the build folder or in the virtualenv folder `venv`. 
10 | norecursedirs=build venv 11 | 12 | # PEP-8 The following are ignored: 13 | # E251 unexpected spaces around keyword / parameter equals 14 | # E225 missing whitespace around operator 15 | # E226 missing whitespace around arithmetic operator 16 | # W291 trailing whitespace 17 | # W293 blank line contains whitespace 18 | # E501 line too long (82 > 79 characters) 19 | # E402 module level import not at top of file - temporary measure to coninue adding ros python packaged in sys.path 20 | # E731 do not assign a lambda expression, use a def 21 | # E302 two blank lines between the functions 22 | # E231 missing whitespace after , 23 | # E241 multiple spaces after ',' 24 | # E261 at least two spaces before inline comment 25 | 26 | 27 | pep8ignore=* E251 \ 28 | * E225 \ 29 | * E226 \ 30 | * W291 \ 31 | * W293 \ 32 | * E501 \ 33 | * E402 \ 34 | * E731 \ 35 | * E302 \ 36 | * E231 \ 37 | * E241 \ 38 | * E261 39 | -------------------------------------------------------------------------------- /tests/rl/agents/test_cem.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import absolute_import 3 | 4 | import pytest 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | 8 | from keras.models import Model, Sequential 9 | from keras.layers import Input, merge, Dense, Flatten 10 | 11 | from rl.agents.cem import CEMAgent 12 | from rl.memory import EpisodeParameterMemory 13 | from rl.processors import MultiInputProcessor 14 | 15 | from ..util import MultiInputTestEnv 16 | 17 | 18 | def test_single_cem_input(): 19 | model = Sequential() 20 | model.add(Flatten(input_shape=(2, 3))) 21 | model.add(Dense(2)) 22 | 23 | memory = EpisodeParameterMemory(limit=10, window_length=2) 24 | agent = CEMAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, train_interval=50) 25 | agent.compile() 26 | agent.fit(MultiInputTestEnv((3,)), nb_steps=100) 27 | 28 | 29 | def test_multi_cem_input(): 30 | input1 = Input(shape=(2, 3)) 31 | input2 = Input(shape=(2, 4)) 32 | x = merge([input1, input2], mode='concat') 33 | x = Flatten()(x) 34 | x = Dense(2)(x) 35 | model = Model(input=[input1, input2], output=x) 36 | 37 | memory = EpisodeParameterMemory(limit=10, window_length=2) 38 | processor = MultiInputProcessor(nb_inputs=2) 39 | agent = CEMAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, 40 | processor=processor, train_interval=50) 41 | agent.compile() 42 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=100) 43 | -------------------------------------------------------------------------------- /examples/sarsa_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents import SARSAAgent 9 | from rl.policy import BoltzmannQPolicy 10 | 11 | 12 | ENV_NAME = 'CartPole-v0' 13 | 14 | # Get the environment and extract the number of actions. 15 | env = gym.make(ENV_NAME) 16 | np.random.seed(123) 17 | env.seed(123) 18 | nb_actions = env.action_space.n 19 | 20 | # Next, we build a very simple model. 
21 | model = Sequential() 22 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 23 | model.add(Dense(16)) 24 | model.add(Activation('relu')) 25 | model.add(Dense(16)) 26 | model.add(Activation('relu')) 27 | model.add(Dense(16)) 28 | model.add(Activation('relu')) 29 | model.add(Dense(nb_actions)) 30 | model.add(Activation('linear')) 31 | print(model.summary()) 32 | 33 | # SARSA does not require a memory. 34 | policy = BoltzmannQPolicy() 35 | sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy) 36 | sarsa.compile(Adam(lr=1e-3), metrics=['mae']) 37 | 38 | # Okay, now it's time to learn something! We visualize the training here for show, but this 39 | # slows down training quite a lot. You can always safely abort the training prematurely using 40 | # Ctrl + C. 41 | sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2) 42 | 43 | # After training is done, we save the final weights. 44 | sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 45 | 46 | # Finally, evaluate our algorithm for 5 episodes. 47 | sarsa.test(env, nb_episodes=5, visualize=True) 48 | -------------------------------------------------------------------------------- /examples/visualize_log.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def visualize_log(filename, figsize=None, output=None): 8 | with open(filename, 'r') as f: 9 | data = json.load(f) 10 | if 'episode' not in data: 11 | raise ValueError('Log file "{}" does not contain the "episode" key.'.format(filename)) 12 | episodes = data['episode'] 13 | 14 | # Get value keys. The x axis is shared and is the number of episodes. 15 | keys = sorted(list(set(data.keys()).difference(set(['episode'])))) 16 | 17 | if figsize is None: 18 | figsize = (15., 5. * len(keys)) 19 | f, axarr = plt.subplots(len(keys), sharex=True, figsize=figsize) 20 | for idx, key in enumerate(keys): 21 | axarr[idx].plot(episodes, data[key]) 22 | axarr[idx].set_ylabel(key) 23 | plt.xlabel('episodes') 24 | plt.tight_layout() 25 | if output is None: 26 | plt.show() 27 | else: 28 | plt.savefig(output) 29 | 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('filename', type=str, help='The filename of the JSON log generated during training.') 33 | parser.add_argument('--output', type=str, default=None, help='The output file. If not specified, the log will only be displayed.') 34 | parser.add_argument('--figsize', nargs=2, type=float, default=None, help='The size of the figure in `width height` format specified in points.') 35 | args = parser.parse_args() 36 | 37 | # You can use visualize_log to easily view the stats that were recorded during training. Simply 38 | # provide the filename of the `FileLogger` that was used in `FileLogger`. 
39 | visualize_log(args.filename, output=args.output, figsize=args.figsize) 40 | -------------------------------------------------------------------------------- /examples/dqn_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents.dqn import DQNAgent 9 | from rl.policy import BoltzmannQPolicy 10 | from rl.memory import SequentialMemory 11 | 12 | 13 | ENV_NAME = 'CartPole-v0' 14 | 15 | 16 | # Get the environment and extract the number of actions. 17 | env = gym.make(ENV_NAME) 18 | np.random.seed(123) 19 | env.seed(123) 20 | nb_actions = env.action_space.n 21 | 22 | # Next, we build a very simple model. 23 | model = Sequential() 24 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 25 | model.add(Dense(16)) 26 | model.add(Activation('relu')) 27 | model.add(Dense(16)) 28 | model.add(Activation('relu')) 29 | model.add(Dense(16)) 30 | model.add(Activation('relu')) 31 | model.add(Dense(nb_actions)) 32 | model.add(Activation('linear')) 33 | print(model.summary()) 34 | 35 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 36 | # even the metrics! 37 | memory = SequentialMemory(limit=50000, window_length=1) 38 | policy = BoltzmannQPolicy() 39 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 40 | target_model_update=1e-2, policy=policy) 41 | dqn.compile(Adam(lr=1e-3), metrics=['mae']) 42 | 43 | # Okay, now it's time to learn something! We visualize the training here for show, but this 44 | # slows down training quite a lot. You can always safely abort the training prematurely using 45 | # Ctrl + C. 46 | dqn.fit(env, nb_steps=50000, visualize=True, verbose=2) 47 | 48 | # After training is done, we save the final weights. 49 | dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 50 | 51 | # Finally, evaluate our algorithm for 5 episodes. 52 | dqn.test(env, nb_episodes=5, visualize=True) 53 | -------------------------------------------------------------------------------- /docs/sources/processors.md: -------------------------------------------------------------------------------- 1 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/processors.py#L7) 2 | ### MultiInputProcessor 3 | 4 | ```python 5 | rl.processors.MultiInputProcessor(nb_inputs) 6 | ``` 7 | 8 | Converts observations from an environment with multiple observations for use in a neural network 9 | policy. 10 | 11 | In some cases, you have environments that return multiple different observations per timestep 12 | (in a robotics context, for example, a camera may be used to view the scene and a joint encoder may 13 | be used to report the angles for each joint). Usually, this can be handled by a policy that has 14 | multiple inputs, one for each modality. However, observations are returned by the environment 15 | in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network 16 | expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [[modalityn_1, ..., modalityn_T]]`. 17 | This processor converts observations appropriate for this use case. 18 | 19 | __Arguments__ 20 | 21 | - __nb_inputs__ (integer): The number of inputs, that is different modalities, to be used. 
22 | Your neural network that you use for the policy must have a corresponding number of 23 | inputs. 24 | 25 | ---- 26 | 27 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/processors.py#L40) 28 | ### WhiteningNormalizerProcessor 29 | 30 | ```python 31 | rl.processors.WhiteningNormalizerProcessor() 32 | ``` 33 | 34 | Normalizes the observations to have zero mean and standard deviation of one, 35 | i.e. it applies whitening to the inputs. 36 | 37 | This typically helps significantly with learning, especially if different dimensions are 38 | on different scales. However, it complicates training in the sense that you will have to store 39 | these weights alongside the policy if you intend to load it later. It is the responsibility of 40 | the user to do so. 41 | 42 | -------------------------------------------------------------------------------- /examples/cem_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents.cem import CEMAgent 9 | from rl.memory import EpisodeParameterMemory 10 | 11 | ENV_NAME = 'CartPole-v0' 12 | 13 | 14 | # Get the environment and extract the number of actions. 15 | env = gym.make(ENV_NAME) 16 | np.random.seed(123) 17 | env.seed(123) 18 | 19 | nb_actions = env.action_space.n 20 | obs_dim = env.observation_space.shape[0] 21 | 22 | # Option 1 : Simple model 23 | model = Sequential() 24 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 25 | model.add(Dense(nb_actions)) 26 | model.add(Activation('softmax')) 27 | 28 | # Option 2: deep network 29 | # model = Sequential() 30 | # model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 31 | # model.add(Dense(16)) 32 | # model.add(Activation('relu')) 33 | # model.add(Dense(16)) 34 | # model.add(Activation('relu')) 35 | # model.add(Dense(16)) 36 | # model.add(Activation('relu')) 37 | # model.add(Dense(nb_actions)) 38 | # model.add(Activation('softmax')) 39 | 40 | 41 | print(model.summary()) 42 | 43 | 44 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 45 | # even the metrics! 46 | memory = EpisodeParameterMemory(limit=1000, window_length=1) 47 | 48 | cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, 49 | batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05) 50 | cem.compile() 51 | 52 | # Okay, now it's time to learn something! We visualize the training here for show, but this 53 | # slows down training quite a lot. You can always safely abort the training prematurely using 54 | # Ctrl + C. 55 | cem.fit(env, nb_steps=100000, visualize=False, verbose=2) 56 | 57 | # After training is done, we save the best weights. 58 | cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True) 59 | 60 | # Finally, evaluate our algorithm for 5 episodes. 
61 | cem.test(env, nb_episodes=5, visualize=True) 62 | -------------------------------------------------------------------------------- /examples/duel_dqn_cartpole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents.dqn import DQNAgent 9 | from rl.policy import BoltzmannQPolicy 10 | from rl.memory import SequentialMemory 11 | 12 | 13 | ENV_NAME = 'CartPole-v0' 14 | 15 | 16 | # Get the environment and extract the number of actions. 17 | env = gym.make(ENV_NAME) 18 | np.random.seed(123) 19 | env.seed(123) 20 | nb_actions = env.action_space.n 21 | 22 | # Next, we build a very simple model regardless of the dueling architecture 23 | # if you enable dueling network in DQN , DQN will build a dueling network base on your model automatically 24 | # Also, you can build a dueling network by yourself and turn off the dueling network in DQN. 25 | model = Sequential() 26 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 27 | model.add(Dense(16)) 28 | model.add(Activation('relu')) 29 | model.add(Dense(16)) 30 | model.add(Activation('relu')) 31 | model.add(Dense(16)) 32 | model.add(Activation('relu')) 33 | model.add(Dense(nb_actions, activation='linear')) 34 | print(model.summary()) 35 | 36 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 37 | # even the metrics! 38 | memory = SequentialMemory(limit=50000, window_length=1) 39 | policy = BoltzmannQPolicy() 40 | # enable the dueling network 41 | # you can specify the dueling_type to one of {'avg','max','naive'} 42 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 43 | enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=policy) 44 | dqn.compile(Adam(lr=1e-3), metrics=['mae']) 45 | 46 | # Okay, now it's time to learn something! We visualize the training here for show, but this 47 | # slows down training quite a lot. You can always safely abort the training prematurely using 48 | # Ctrl + C. 49 | dqn.fit(env, nb_steps=50000, visualize=False, verbose=2) 50 | 51 | # After training is done, we save the final weights. 52 | dqn.save_weights('duel_dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 53 | 54 | # Finally, evaluate our algorithm for 5 episodes. 55 | dqn.test(env, nb_episodes=5, visualize=False) 56 | -------------------------------------------------------------------------------- /rl/random.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | 4 | 5 | class RandomProcess(object): 6 | def reset_states(self): 7 | pass 8 | 9 | 10 | class AnnealedGaussianProcess(RandomProcess): 11 | def __init__(self, mu, sigma, sigma_min, n_steps_annealing): 12 | self.mu = mu 13 | self.sigma = sigma 14 | self.n_steps = 0 15 | 16 | if sigma_min is not None: 17 | self.m = -float(sigma - sigma_min) / float(n_steps_annealing) 18 | self.c = sigma 19 | self.sigma_min = sigma_min 20 | else: 21 | self.m = 0. 
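# No annealing in this case: the slope m is zero, so current_sigma stays fixed at sigma.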
22 | self.c = sigma 23 | self.sigma_min = sigma 24 | 25 | @property 26 | def current_sigma(self): 27 | sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c) 28 | return sigma 29 | 30 | 31 | class GaussianWhiteNoiseProcess(AnnealedGaussianProcess): 32 | def __init__(self, mu=0., sigma=1., sigma_min=None, n_steps_annealing=1000, size=1): 33 | super(GaussianWhiteNoiseProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing) 34 | self.size = size 35 | 36 | def sample(self): 37 | sample = np.random.normal(self.mu, self.current_sigma, self.size) 38 | self.n_steps += 1 39 | return sample 40 | 41 | # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 42 | class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess): 43 | def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000): 44 | super(OrnsteinUhlenbeckProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing) 45 | self.theta = theta 46 | self.mu = mu 47 | self.dt = dt 48 | self.x0 = x0 49 | self.size = size 50 | self.reset_states() 51 | 52 | def sample(self): 53 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size) 54 | self.x_prev = x 55 | self.n_steps += 1 56 | return x 57 | 58 | def reset_states(self): 59 | self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size) 60 | -------------------------------------------------------------------------------- /docs/sources/core.md: -------------------------------------------------------------------------------- 1 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L11) 2 | ### Agent 3 | 4 | ```python 5 | rl.core.Agent(processor=None) 6 | ``` 7 | 8 | Abstract base class for all implemented agents. 9 | 10 | Each agent interacts with the environment (as defined by the `Env` class) by first observing the 11 | state of the environment. Based on this observation the agent changes the environment by performing 12 | an action. 13 | 14 | Do not use this abstract base class directly but instead use one of the concrete agents implemented. 15 | Each agent realizes a reinforcement learning algorithm. Since all agents conform to the same 16 | interface, you can use them interchangeably. 17 | 18 | To implement your own agent, you have to implement the following methods: 19 | 20 | - `forward` 21 | - `backward` 22 | - `compile` 23 | - `load_weights` 24 | - `save_weights` 25 | - `layers` 26 | 27 | __Arguments__ 28 | 29 | - __processor__ (`Processor` instance): See [Processor](#processor) for details. 30 | 31 | ---- 32 | 33 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L454) 34 | ### Processor 35 | 36 | ```python 37 | rl.core.Processor() 38 | ``` 39 | 40 | Abstract base class for implementing processors. 41 | 42 | A processor acts as a coupling mechanism between an `Agent` and its `Env`. This can 43 | be necessary if your agent has different requirements with respect to the form of the 44 | observations, actions, and rewards of the environment. By implementing a custom processor, 45 | you can effectively translate between the two without having to change the underlaying 46 | implementation of the agent or environment. 47 | 48 | Do not use this abstract base class directly but instead use one of the concrete implementations 49 | or write your own. 
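For illustration, here is a minimal custom processor in the style of those used by the bundled examples (`PendulumProcessor` in `examples/naf_pendulum.py` and `MujocoProcessor` in `examples/ddpg_mujoco.py`); the class name and the concrete scaling and clipping values are illustrative, not part of the library:

```python
import numpy as np

from rl.core import Processor


class ScaleAndClipProcessor(Processor):
    """Illustrative processor: rescales rewards and clips actions."""

    def process_reward(self, reward):
        # Reduce the reward magnitude, as done for Pendulum in the NAF example.
        return reward / 100.

    def process_action(self, action):
        # Keep actions inside the valid range, as done in the MuJoCo DDPG example.
        return np.clip(action, -1., 1.)
```

An instance of such a processor is handed to an agent via its `processor` constructor argument (see the agent constructors in the bundled examples).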
50 | 51 | ---- 52 | 53 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L533) 54 | ### Env 55 | 56 | ```python 57 | rl.core.Env() 58 | ``` 59 | 60 | The abstract environment class that is used by all agents. This class has the exact 61 | same API that OpenAI Gym uses so that integrating with it is trivial. In contrast to the 62 | OpenAI Gym implementation, this class only defines the abstract methods without any actual 63 | implementation. 64 | 65 | ---- 66 | 67 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L609) 68 | ### Space 69 | 70 | ```python 71 | rl.core.Space() 72 | ``` 73 | 74 | Abstract model for a space that is used for the state and action spaces. This class has the 75 | exact same API that OpenAI Gym uses so that integrating with it is trivial. 76 | 77 | -------------------------------------------------------------------------------- /rl/processors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rl.core import Processor 4 | from rl.util import WhiteningNormalizer 5 | 6 | 7 | class MultiInputProcessor(Processor): 8 | """Converts observations from an environment with multiple observations for use in a neural network 9 | policy. 10 | 11 | In some cases, you have environments that return multiple different observations per timestep 12 | (in a robotics context, for example, a camera may be used to view the scene and a joint encoder may 13 | be used to report the angles for each joint). Usually, this can be handled by a policy that has 14 | multiple inputs, one for each modality. However, observations are returned by the environment 15 | in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network 16 | expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [[modalityn_1, ..., modalityn_T]]`. 17 | This processor converts observations appropriate for this use case. 18 | 19 | # Arguments 20 | nb_inputs (integer): The number of inputs, that is different modalities, to be used. 21 | Your neural network that you use for the policy must have a corresponding number of 22 | inputs. 23 | """ 24 | def __init__(self, nb_inputs): 25 | self.nb_inputs = nb_inputs 26 | 27 | def process_state_batch(self, state_batch): 28 | input_batches = [[] for x in range(self.nb_inputs)] 29 | for state in state_batch: 30 | processed_state = [[] for x in range(self.nb_inputs)] 31 | for observation in state: 32 | assert len(observation) == self.nb_inputs 33 | for o, s in zip(observation, processed_state): 34 | s.append(o) 35 | for idx, s in enumerate(processed_state): 36 | input_batches[idx].append(s) 37 | return [np.array(x) for x in input_batches] 38 | 39 | 40 | class WhiteningNormalizerProcessor(Processor): 41 | """Normalizes the observations to have zero mean and standard deviation of one, 42 | i.e. it applies whitening to the inputs. 43 | 44 | This typically helps significantly with learning, especially if different dimensions are 45 | on different scales. However, it complicates training in the sense that you will have to store 46 | these weights alongside the policy if you intend to load it later. It is the responsibility of 47 | the user to do so. 
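The underlying `WhiteningNormalizer` is created lazily from the first processed batch (using that batch's shape and dtype), is updated with every subsequent batch, and is accessible afterwards as `self.normalizer`.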
48 | """ 49 | def __init__(self): 50 | self.normalizer = None 51 | 52 | def process_state_batch(self, batch): 53 | if self.normalizer is None: 54 | self.normalizer = WhiteningNormalizer(shape=batch.shape[1:], dtype=batch.dtype) 55 | self.normalizer.update(batch) 56 | return self.normalizer.normalize(batch) 57 | -------------------------------------------------------------------------------- /examples/ddpg_pendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential, Model 5 | from keras.layers import Dense, Activation, Flatten, Input, merge 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents import DDPGAgent 9 | from rl.memory import SequentialMemory 10 | from rl.random import OrnsteinUhlenbeckProcess 11 | 12 | 13 | ENV_NAME = 'Pendulum-v0' 14 | gym.undo_logger_setup() 15 | 16 | 17 | # Get the environment and extract the number of actions. 18 | env = gym.make(ENV_NAME) 19 | np.random.seed(123) 20 | env.seed(123) 21 | assert len(env.action_space.shape) == 1 22 | nb_actions = env.action_space.shape[0] 23 | 24 | # Next, we build a very simple model. 25 | actor = Sequential() 26 | actor.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 27 | actor.add(Dense(16)) 28 | actor.add(Activation('relu')) 29 | actor.add(Dense(16)) 30 | actor.add(Activation('relu')) 31 | actor.add(Dense(16)) 32 | actor.add(Activation('relu')) 33 | actor.add(Dense(nb_actions)) 34 | actor.add(Activation('linear')) 35 | print(actor.summary()) 36 | 37 | action_input = Input(shape=(nb_actions,), name='action_input') 38 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 39 | flattened_observation = Flatten()(observation_input) 40 | x = merge([action_input, flattened_observation], mode='concat') 41 | x = Dense(32)(x) 42 | x = Activation('relu')(x) 43 | x = Dense(32)(x) 44 | x = Activation('relu')(x) 45 | x = Dense(32)(x) 46 | x = Activation('relu')(x) 47 | x = Dense(1)(x) 48 | x = Activation('linear')(x) 49 | critic = Model(input=[action_input, observation_input], output=x) 50 | print(critic.summary()) 51 | 52 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 53 | # even the metrics! 54 | memory = SequentialMemory(limit=100000, window_length=1) 55 | random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3) 56 | agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, 57 | memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, 58 | random_process=random_process, gamma=.99, target_model_update=1e-3) 59 | agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) 60 | 61 | # Okay, now it's time to learn something! We visualize the training here for show, but this 62 | # slows down training quite a lot. You can always safely abort the training prematurely using 63 | # Ctrl + C. 64 | agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200) 65 | 66 | # After training is done, we save the final weights. 67 | agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 68 | 69 | # Finally, evaluate our algorithm for 5 episodes. 
70 | agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200) 71 | -------------------------------------------------------------------------------- /tests/rl/agents/test_ddpg.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import absolute_import 3 | 4 | import pytest 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | 8 | from keras.models import Model, Sequential 9 | from keras.layers import Input, merge, Dense, Flatten 10 | 11 | from rl.agents.ddpg import DDPGAgent 12 | from rl.memory import SequentialMemory 13 | from rl.processors import MultiInputProcessor 14 | 15 | from ..util import MultiInputTestEnv 16 | 17 | 18 | def test_single_ddpg_input(): 19 | nb_actions = 2 20 | 21 | actor = Sequential() 22 | actor.add(Flatten(input_shape=(2, 3))) 23 | actor.add(Dense(nb_actions)) 24 | 25 | action_input = Input(shape=(nb_actions,), name='action_input') 26 | observation_input = Input(shape=(2, 3), name='observation_input') 27 | x = merge([action_input, Flatten()(observation_input)], mode='concat') 28 | x = Dense(1)(x) 29 | critic = Model(input=[action_input, observation_input], output=x) 30 | 31 | memory = SequentialMemory(limit=10, window_length=2) 32 | agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input, memory=memory, 33 | nb_actions=2, nb_steps_warmup_critic=5, nb_steps_warmup_actor=5, batch_size=4) 34 | agent.compile('sgd') 35 | agent.fit(MultiInputTestEnv((3,)), nb_steps=10) 36 | 37 | 38 | def test_multi_ddpg_input(): 39 | nb_actions = 2 40 | 41 | actor_observation_input1 = Input(shape=(2, 3), name='actor_observation_input1') 42 | actor_observation_input2 = Input(shape=(2, 4), name='actor_observation_input2') 43 | actor = Sequential() 44 | x = merge([actor_observation_input1, actor_observation_input2], mode='concat') 45 | x = Flatten()(x) 46 | x = Dense(nb_actions)(x) 47 | actor = Model(input=[actor_observation_input1, actor_observation_input2], output=x) 48 | 49 | action_input = Input(shape=(nb_actions,), name='action_input') 50 | critic_observation_input1 = Input(shape=(2, 3), name='critic_observation_input1') 51 | critic_observation_input2 = Input(shape=(2, 4), name='critic_observation_input2') 52 | x = merge([critic_observation_input1, critic_observation_input2], mode='concat') 53 | x = merge([action_input, Flatten()(x)], mode='concat') 54 | x = Dense(1)(x) 55 | critic = Model(input=[action_input, critic_observation_input1, critic_observation_input2], output=x) 56 | 57 | processor = MultiInputProcessor(nb_inputs=2) 58 | memory = SequentialMemory(limit=10, window_length=2) 59 | agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input, memory=memory, 60 | nb_actions=2, nb_steps_warmup_critic=5, nb_steps_warmup_actor=5, batch_size=4, 61 | processor=processor) 62 | agent.compile('sgd') 63 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10) 64 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | language: python 4 | matrix: 5 | include: 6 | - python: 3.5 7 | env: KERAS_BACKEND=theano 8 | - python: 3.5 9 | env: KERAS_BACKEND=tensorflow 10 | - python: 2.7 11 | env: KERAS_BACKEND=theano 12 | - python: 2.7 13 | env: KERAS_BACKEND=tensorflow 14 | - python: 2.7 15 | env: KERAS_BACKEND=tensorflow LEGACY_KERAS=1 16 | - python: 2.7 17 | env: 
KERAS_BACKEND=tensorflow TEST_MODE=PEP8 18 | - python: 2.7 19 | env: KERAS_BACKEND=theano TEST_MODE=INTEGRATION 20 | - python: 3.5 21 | env: KERAS_BACKEND=theano TEST_MODE=INTEGRATION 22 | - python: 2.7 23 | env: KERAS_BACKEND=tensorflow TEST_MODE=INTEGRATION 24 | - python: 3.5 25 | env: KERAS_BACKEND=tensorflow TEST_MODE=INTEGRATION 26 | install: 27 | # Adopted from https://github.com/fchollet/keras/blob/master/.travis.yml. 28 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 29 | wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; 30 | else 31 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 32 | fi 33 | - bash miniconda.sh -b -p $HOME/miniconda 34 | - export PATH="$HOME/miniconda/bin:$PATH" 35 | - hash -r 36 | - conda config --set always_yes yes --set changeps1 no 37 | - conda update -q conda 38 | # Useful for debugging any issues with conda 39 | - conda info -a 40 | 41 | - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy matplotlib pandas pytest h5py 42 | - source activate test-environment 43 | - pip install pytest-xdist 44 | # See https://github.com/pytest-dev/pytest-cov/issues/124 for details 45 | - pip install pytest-cov==2.2.1 python-coveralls coverage==3.7.1 46 | - pip install pep8 pytest-pep8 47 | - pip install tensorflow 48 | # Bleeding-edge: pip install git+https://github.com/Theano/Theano.git 49 | - pip install theano>=0.9.0rc1 50 | - pip install gym 51 | # Bleeding-edge: pip install git+https://github.com/fchollet/keras.git; 52 | - if [[ "$LEGACY_KERAS" == "1" ]]; then 53 | pip install keras==1.2.2; 54 | else 55 | pip install "keras<2.0.7"; 56 | fi 57 | 58 | - python setup.py install 59 | 60 | # command to run tests. 61 | script: 62 | # Run keras backend init to initialize backend config. 63 | - python -c "import keras.backend" 64 | # Set up keras backend 65 | - sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' ~/.keras/keras.json; 66 | - echo -e "Running tests with the following config:\n$(cat ~/.keras/keras.json)" 67 | - if [[ "$TEST_MODE" == "INTEGRATION" ]]; then 68 | PYTHONPATH=$PWD:$PYTHONPATH py.test tests/integration; 69 | elif [[ "$TEST_MODE" == "PEP8" ]]; then 70 | PYTHONPATH=$PWD:$PYTHONPATH py.test --pep8 -m pep8 -n0; 71 | else 72 | PYTHONPATH=$PWD:$PYTHONPATH py.test tests/; 73 | fi 74 | after_success: 75 | - coveralls 76 | -------------------------------------------------------------------------------- /examples/ddpg_mujoco.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import gym 4 | from gym import wrappers 5 | 6 | from keras.models import Sequential, Model 7 | from keras.layers import Dense, Activation, Flatten, Input 8 | from keras.optimizers import Adam 9 | 10 | from rl.processors import WhiteningNormalizerProcessor 11 | from rl.agents import DDPGAgent 12 | from rl.memory import SequentialMemory 13 | from rl.random import OrnsteinUhlenbeckProcess 14 | from rl.keras_future import concatenate 15 | 16 | 17 | class MujocoProcessor(WhiteningNormalizerProcessor): 18 | def process_action(self, action): 19 | return np.clip(action, -1., 1.) 20 | 21 | 22 | ENV_NAME = 'HalfCheetah-v1' 23 | gym.undo_logger_setup() 24 | 25 | 26 | # Get the environment and extract the number of actions. 
27 | env = gym.make(ENV_NAME) 28 | env = wrappers.Monitor(env, '/tmp/{}'.format(ENV_NAME), force=True) 29 | np.random.seed(123) 30 | env.seed(123) 31 | assert len(env.action_space.shape) == 1 32 | nb_actions = env.action_space.shape[0] 33 | 34 | # Next, we build a very simple model. 35 | actor = Sequential() 36 | actor.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 37 | actor.add(Dense(400)) 38 | actor.add(Activation('relu')) 39 | actor.add(Dense(300)) 40 | actor.add(Activation('relu')) 41 | actor.add(Dense(nb_actions)) 42 | actor.add(Activation('tanh')) 43 | print(actor.summary()) 44 | 45 | action_input = Input(shape=(nb_actions,), name='action_input') 46 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 47 | flattened_observation = Flatten()(observation_input) 48 | x = Dense(400)(flattened_observation) 49 | x = Activation('relu')(x) 50 | x = concatenate([x, action_input]) 51 | x = Dense(300)(x) 52 | x = Activation('relu')(x) 53 | x = Dense(1)(x) 54 | x = Activation('linear')(x) 55 | critic = Model(input=[action_input, observation_input], output=x) 56 | print(critic.summary()) 57 | 58 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 59 | # even the metrics! 60 | memory = SequentialMemory(limit=100000, window_length=1) 61 | random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1) 62 | agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, 63 | memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, 64 | random_process=random_process, gamma=.99, target_model_update=1e-3, 65 | processor=MujocoProcessor()) 66 | agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae']) 67 | 68 | # Okay, now it's time to learn something! We visualize the training here for show, but this 69 | # slows down training quite a lot. You can always safely abort the training prematurely using 70 | # Ctrl + C. 71 | agent.fit(env, nb_steps=1000000, visualize=False, verbose=1) 72 | 73 | # After training is done, we save the final weights. 74 | agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 75 | 76 | # Finally, evaluate our algorithm for 5 episodes. 
77 | agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200) 78 | -------------------------------------------------------------------------------- /tests/rl/test_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pytest 3 | import numpy as np 4 | from numpy.testing import assert_allclose 5 | 6 | from keras.models import Model, Sequential 7 | from keras.layers import Input, Dense, merge 8 | from keras.optimizers import SGD 9 | import keras.backend as K 10 | 11 | from rl.util import clone_optimizer, clone_model, huber_loss, WhiteningNormalizer 12 | 13 | 14 | def test_clone_sequential_model(): 15 | seq = Sequential() 16 | seq.add(Dense(8, input_shape=(3,))) 17 | seq.compile(optimizer='sgd', loss='mse') 18 | 19 | clone = clone_model(seq) 20 | clone.compile(optimizer='sgd', loss='mse') 21 | 22 | ins = np.random.random((4, 3)) 23 | y_pred_seq = seq.predict_on_batch(ins) 24 | y_pred_clone = clone.predict_on_batch(ins) 25 | assert y_pred_seq.shape == y_pred_clone.shape 26 | assert_allclose(y_pred_seq, y_pred_clone) 27 | 28 | 29 | def test_clone_graph_model(): 30 | in1 = Input(shape=(2,)) 31 | in2 = Input(shape=(3,)) 32 | x = Dense(8)(merge([in1, in2], mode='concat')) 33 | graph = Model([in1, in2], x) 34 | graph.compile(optimizer='sgd', loss='mse') 35 | 36 | clone = clone_model(graph) 37 | clone.compile(optimizer='sgd', loss='mse') 38 | 39 | ins = [np.random.random((4, 2)), np.random.random((4, 3))] 40 | y_pred_graph = graph.predict_on_batch(ins) 41 | y_pred_clone = clone.predict_on_batch(ins) 42 | assert y_pred_graph.shape == y_pred_clone.shape 43 | assert_allclose(y_pred_graph, y_pred_clone) 44 | 45 | 46 | def test_clone_optimizer(): 47 | lr, momentum, clipnorm, clipvalue = np.random.random(size=4) 48 | optimizer = SGD(lr=lr, momentum=momentum, clipnorm=clipnorm, clipvalue=clipvalue) 49 | clone = clone_optimizer(optimizer) 50 | 51 | assert isinstance(clone, SGD) 52 | assert K.get_value(optimizer.lr) == K.get_value(clone.lr) 53 | assert K.get_value(optimizer.momentum) == K.get_value(clone.momentum) 54 | assert optimizer.clipnorm == clone.clipnorm 55 | assert optimizer.clipvalue == clone.clipvalue 56 | 57 | 58 | def test_clone_optimizer_from_string(): 59 | clone = clone_optimizer('sgd') 60 | assert isinstance(clone, SGD) 61 | 62 | 63 | def test_huber_loss(): 64 | a = np.array([1., 1.5, 2., 4.]) 65 | b = np.array([1.5, 1., 4., 2.]) 66 | assert_allclose(K.eval(huber_loss(a, b, 1.)), np.array([.125, .125, 1.5, 1.5])) 67 | assert_allclose(K.eval(huber_loss(a, b, 3.)), np.array([.125, .125, 2., 2.])) 68 | assert_allclose(K.eval(huber_loss(a, b, np.inf)), np.array([.125, .125, 2., 2.])) 69 | 70 | 71 | def test_whitening_normalizer(): 72 | x = np.random.normal(loc=.2, scale=2., size=(1000, 5)) 73 | normalizer = WhiteningNormalizer(shape=(5,)) 74 | normalizer.update(x[:500]) 75 | normalizer.update(x[500:]) 76 | 77 | assert_allclose(normalizer.mean, np.mean(x, axis=0)) 78 | assert_allclose(normalizer.std, np.std(x, axis=0)) 79 | 80 | x_norm = normalizer.normalize(x) 81 | assert_allclose(np.mean(x_norm, axis=0), np.zeros(5, dtype=normalizer.dtype), atol=1e-5) 82 | assert_allclose(np.std(x_norm, axis=0), np.ones(5, dtype=normalizer.dtype), atol=1e-5) 83 | 84 | x_denorm = normalizer.denormalize(x_norm) 85 | assert_allclose(x_denorm, x) 86 | 87 | 88 | if __name__ == '__main__': 89 | pytest.main([__file__]) 90 | -------------------------------------------------------------------------------- 
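The expected values in `test_huber_loss` above follow from the standard piecewise Huber definition: quadratic inside the clip range, linear outside it. A small NumPy reference that reproduces those expectations (the helper name is hypothetical and not part of `rl.util`):

```python
import numpy as np

def huber_loss_reference(y_true, y_pred, clip_value):
    # 0.5 * d^2 for |d| <= clip_value, clip_value * (|d| - 0.5 * clip_value) otherwise.
    d = np.abs(y_true - y_pred)
    return np.where(d <= clip_value,
                    0.5 * np.square(d),
                    clip_value * (d - 0.5 * clip_value))

a = np.array([1., 1.5, 2., 4.])
b = np.array([1.5, 1., 4., 2.])
print(huber_loss_reference(a, b, 1.))  # [0.125 0.125 1.5   1.5  ]
print(huber_loss_reference(a, b, 3.))  # [0.125 0.125 2.    2.   ]
```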
/examples/naf_pendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Activation, Flatten, Input 6 | from keras.optimizers import Adam 7 | 8 | from rl.agents import NAFAgent 9 | from rl.memory import SequentialMemory 10 | from rl.random import OrnsteinUhlenbeckProcess 11 | from rl.core import Processor 12 | from rl.keras_future import concatenate, Model 13 | 14 | class PendulumProcessor(Processor): 15 | def process_reward(self, reward): 16 | # The magnitude of the reward can be important. Since each step yields a relatively 17 | # high reward, we reduce the magnitude by two orders. 18 | return reward / 100. 19 | 20 | 21 | ENV_NAME = 'Pendulum-v0' 22 | gym.undo_logger_setup() 23 | 24 | 25 | # Get the environment and extract the number of actions. 26 | env = gym.make(ENV_NAME) 27 | np.random.seed(123) 28 | env.seed(123) 29 | assert len(env.action_space.shape) == 1 30 | nb_actions = env.action_space.shape[0] 31 | 32 | # Build all necessary models: V, mu, and L networks. 33 | V_model = Sequential() 34 | V_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 35 | V_model.add(Dense(16)) 36 | V_model.add(Activation('relu')) 37 | V_model.add(Dense(16)) 38 | V_model.add(Activation('relu')) 39 | V_model.add(Dense(16)) 40 | V_model.add(Activation('relu')) 41 | V_model.add(Dense(1)) 42 | V_model.add(Activation('linear')) 43 | print(V_model.summary()) 44 | 45 | mu_model = Sequential() 46 | mu_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 47 | mu_model.add(Dense(16)) 48 | mu_model.add(Activation('relu')) 49 | mu_model.add(Dense(16)) 50 | mu_model.add(Activation('relu')) 51 | mu_model.add(Dense(16)) 52 | mu_model.add(Activation('relu')) 53 | mu_model.add(Dense(nb_actions)) 54 | mu_model.add(Activation('linear')) 55 | print(mu_model.summary()) 56 | 57 | action_input = Input(shape=(nb_actions,), name='action_input') 58 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 59 | x = concatenate([action_input, Flatten()(observation_input)]) 60 | x = Dense(32)(x) 61 | x = Activation('relu')(x) 62 | x = Dense(32)(x) 63 | x = Activation('relu')(x) 64 | x = Dense(32)(x) 65 | x = Activation('relu')(x) 66 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 67 | x = Activation('linear')(x) 68 | L_model = Model(input=[action_input, observation_input], output=x) 69 | print(L_model.summary()) 70 | 71 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 72 | # even the metrics! 73 | processor = PendulumProcessor() 74 | memory = SequentialMemory(limit=100000, window_length=1) 75 | random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions) 76 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 77 | memory=memory, nb_steps_warmup=100, random_process=random_process, 78 | gamma=.99, target_model_update=1e-3, processor=processor) 79 | agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) 80 | 81 | # Okay, now it's time to learn something! We visualize the training here for show, but this 82 | # slows down training quite a lot. You can always safely abort the training prematurely using 83 | # Ctrl + C. 84 | agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200) 85 | 86 | # After training is done, we save the final weights. 
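# (They can be loaded back later with agent.load_weights('cdqn_{}_weights.h5f'.format(ENV_NAME)),
#  e.g. to re-run the evaluation below without retraining.)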
87 | agent.save_weights('cdqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True) 88 | 89 | # Finally, evaluate our algorithm for 5 episodes. 90 | agent.test(env, nb_episodes=10, visualize=True, nb_max_episode_steps=200) 91 | -------------------------------------------------------------------------------- /tests/integration/test_continuous.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import gym 5 | 6 | from keras.models import Sequential 7 | from keras.layers import Dense, Activation, Flatten, Input 8 | from keras.optimizers import Adam 9 | 10 | from rl.agents import NAFAgent, DDPGAgent 11 | from rl.random import OrnsteinUhlenbeckProcess 12 | from rl.memory import SequentialMemory 13 | from rl.keras_future import Model, concatenate 14 | 15 | 16 | def test_cdqn(): 17 | # TODO: replace this with a simpler environment where we can actually test if it finds a solution 18 | env = gym.make('Pendulum-v0') 19 | np.random.seed(123) 20 | env.seed(123) 21 | random.seed(123) 22 | nb_actions = env.action_space.shape[0] 23 | 24 | V_model = Sequential() 25 | V_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 26 | V_model.add(Dense(16)) 27 | V_model.add(Activation('relu')) 28 | V_model.add(Dense(1)) 29 | 30 | mu_model = Sequential() 31 | mu_model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 32 | mu_model.add(Dense(16)) 33 | mu_model.add(Activation('relu')) 34 | mu_model.add(Dense(nb_actions)) 35 | 36 | action_input = Input(shape=(nb_actions,), name='action_input') 37 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 38 | x = concatenate([action_input, Flatten()(observation_input)]) 39 | x = Dense(16)(x) 40 | x = Activation('relu')(x) 41 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 42 | L_model = Model(input=[action_input, observation_input], output=x) 43 | 44 | memory = SequentialMemory(limit=1000, window_length=1) 45 | random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3, size=nb_actions) 46 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 47 | memory=memory, nb_steps_warmup=50, random_process=random_process, 48 | gamma=.99, target_model_update=1e-3) 49 | agent.compile(Adam(lr=1e-3)) 50 | 51 | agent.fit(env, nb_steps=400, visualize=False, verbose=0, nb_max_episode_steps=100) 52 | h = agent.test(env, nb_episodes=2, visualize=False, nb_max_episode_steps=100) 53 | # TODO: evaluate history 54 | 55 | 56 | def test_ddpg(): 57 | # TODO: replace this with a simpler environment where we can actually test if it finds a solution 58 | env = gym.make('Pendulum-v0') 59 | np.random.seed(123) 60 | env.seed(123) 61 | random.seed(123) 62 | nb_actions = env.action_space.shape[0] 63 | 64 | actor = Sequential() 65 | actor.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 66 | actor.add(Dense(16)) 67 | actor.add(Activation('relu')) 68 | actor.add(Dense(nb_actions)) 69 | actor.add(Activation('linear')) 70 | 71 | action_input = Input(shape=(nb_actions,), name='action_input') 72 | observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input') 73 | flattened_observation = Flatten()(observation_input) 74 | x = concatenate([action_input, flattened_observation]) 75 | x = Dense(16)(x) 76 | x = Activation('relu')(x) 77 | x = Dense(1)(x) 78 | x = Activation('linear')(x) 79 | critic = Model(input=[action_input, observation_input], output=x) 80 | 81 | 
memory = SequentialMemory(limit=1000, window_length=1) 82 | random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3) 83 | agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, 84 | memory=memory, nb_steps_warmup_critic=50, nb_steps_warmup_actor=50, 85 | random_process=random_process, gamma=.99, target_model_update=1e-3) 86 | agent.compile([Adam(lr=1e-3), Adam(lr=1e-3)]) 87 | 88 | agent.fit(env, nb_steps=400, visualize=False, verbose=0, nb_max_episode_steps=100) 89 | h = agent.test(env, nb_episodes=2, visualize=False, nb_max_episode_steps=100) 90 | # TODO: evaluate history 91 | -------------------------------------------------------------------------------- /rl/policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | 4 | from rl.util import * 5 | 6 | 7 | class Policy(object): 8 | def _set_agent(self, agent): 9 | self.agent = agent 10 | 11 | @property 12 | def metrics_names(self): 13 | return [] 14 | 15 | @property 16 | def metrics(self): 17 | return [] 18 | 19 | def select_action(self, **kwargs): 20 | raise NotImplementedError() 21 | 22 | def get_config(self): 23 | return {} 24 | 25 | 26 | class LinearAnnealedPolicy(Policy): 27 | def __init__(self, inner_policy, attr, value_max, value_min, value_test, nb_steps): 28 | if not hasattr(inner_policy, attr): 29 | raise ValueError('Policy "{}" does not have attribute "{}".'.format(attr)) 30 | 31 | super(LinearAnnealedPolicy, self).__init__() 32 | 33 | self.inner_policy = inner_policy 34 | self.attr = attr 35 | self.value_max = value_max 36 | self.value_min = value_min 37 | self.value_test = value_test 38 | self.nb_steps = nb_steps 39 | 40 | def get_current_value(self): 41 | if self.agent.training: 42 | # Linear annealed: f(x) = ax + b. 
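            # For example, with the values used in examples/dqn_atari.py
            # (value_max=1., value_min=.1, nb_steps=1000000):
            #   a = -(1. - .1) / 1000000 = -9e-7 and b = 1., so the annealed value
            #   falls linearly from 1.0 to 0.1 over the first 1M training steps and
            #   the max(...) below keeps it at value_min afterwards.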
43 | a = -float(self.value_max - self.value_min) / float(self.nb_steps) 44 | b = float(self.value_max) 45 | value = max(self.value_min, a * float(self.agent.step) + b) 46 | else: 47 | value = self.value_test 48 | return value 49 | 50 | def select_action(self, **kwargs): 51 | setattr(self.inner_policy, self.attr, self.get_current_value()) 52 | return self.inner_policy.select_action(**kwargs) 53 | 54 | @property 55 | def metrics_names(self): 56 | return ['mean_{}'.format(self.attr)] 57 | 58 | @property 59 | def metrics(self): 60 | return [getattr(self.inner_policy, self.attr)] 61 | 62 | def get_config(self): 63 | config = super(LinearAnnealedPolicy, self).get_config() 64 | config['attr'] = self.attr 65 | config['value_max'] = self.value_max 66 | config['value_min'] = self.value_min 67 | config['value_test'] = self.value_test 68 | config['nb_steps'] = self.nb_steps 69 | config['inner_policy'] = get_object_config(self.inner_policy) 70 | return config 71 | 72 | 73 | class EpsGreedyQPolicy(Policy): 74 | def __init__(self, eps=.1): 75 | super(EpsGreedyQPolicy, self).__init__() 76 | self.eps = eps 77 | 78 | def select_action(self, q_values): 79 | assert q_values.ndim == 1 80 | nb_actions = q_values.shape[0] 81 | 82 | if np.random.uniform() < self.eps: 83 | action = np.random.random_integers(0, nb_actions-1) 84 | else: 85 | action = np.argmax(q_values) 86 | return action 87 | 88 | def get_config(self): 89 | config = super(EpsGreedyQPolicy, self).get_config() 90 | config['eps'] = self.eps 91 | return config 92 | 93 | 94 | class GreedyQPolicy(Policy): 95 | def select_action(self, q_values): 96 | assert q_values.ndim == 1 97 | action = np.argmax(q_values) 98 | return action 99 | 100 | 101 | class BoltzmannQPolicy(Policy): 102 | def __init__(self, tau=1., clip=(-500., 500.)): 103 | super(BoltzmannQPolicy, self).__init__() 104 | self.tau = tau 105 | self.clip = clip 106 | 107 | def select_action(self, q_values): 108 | assert q_values.ndim == 1 109 | q_values = q_values.astype('float64') 110 | nb_actions = q_values.shape[0] 111 | 112 | exp_values = np.exp(np.clip(q_values / self.tau, self.clip[0], self.clip[1])) 113 | probs = exp_values / np.sum(exp_values) 114 | action = np.random.choice(range(nb_actions), p=probs) 115 | return action 116 | 117 | def get_config(self): 118 | config = super(BoltzmannQPolicy, self).get_config() 119 | config['tau'] = self.tau 120 | config['clip'] = self.clip 121 | return config 122 | -------------------------------------------------------------------------------- /rl/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from keras.models import model_from_config, Sequential, Model, model_from_config 4 | import keras.optimizers as optimizers 5 | import keras.backend as K 6 | 7 | 8 | def clone_model(model, custom_objects={}): 9 | # Requires Keras 1.0.7 since get_config has breaking changes. 10 | config = { 11 | 'class_name': model.__class__.__name__, 12 | 'config': model.get_config(), 13 | } 14 | clone = model_from_config(config, custom_objects=custom_objects) 15 | clone.set_weights(model.get_weights()) 16 | return clone 17 | 18 | 19 | def clone_optimizer(optimizer): 20 | if type(optimizer) is str: 21 | return optimizers.get(optimizer) 22 | # Requires Keras 1.0.7 since get_config has breaking changes. 
23 | params = dict([(k, v) for k, v in optimizer.get_config().items()]) 24 | config = { 25 | 'class_name': optimizer.__class__.__name__, 26 | 'config': params, 27 | } 28 | if hasattr(optimizers, 'optimizer_from_config'): 29 | # COMPATIBILITY: Keras < 2.0 30 | clone = optimizers.optimizer_from_config(config) 31 | else: 32 | clone = optimizers.deserialize(config) 33 | return clone 34 | 35 | 36 | def get_soft_target_model_updates(target, source, tau): 37 | target_weights = target.trainable_weights + sum([l.non_trainable_weights for l in target.layers], []) 38 | source_weights = source.trainable_weights + sum([l.non_trainable_weights for l in source.layers], []) 39 | assert len(target_weights) == len(source_weights) 40 | 41 | # Create updates. 42 | updates = [] 43 | for tw, sw in zip(target_weights, source_weights): 44 | updates.append((tw, tau * sw + (1. - tau) * tw)) 45 | return updates 46 | 47 | 48 | def get_object_config(o): 49 | if o is None: 50 | return None 51 | 52 | config = { 53 | 'class_name': o.__class__.__name__, 54 | 'config': o.get_config() 55 | } 56 | return config 57 | 58 | 59 | def huber_loss(y_true, y_pred, clip_value): 60 | # Huber loss, see https://en.wikipedia.org/wiki/Huber_loss and 61 | # https://medium.com/@karpathy/yes-you-should-understand-backprop-e2f06eab496b 62 | # for details. 63 | assert clip_value > 0. 64 | 65 | x = y_true - y_pred 66 | if np.isinf(clip_value): 67 | # Spacial case for infinity since Tensorflow does have problems 68 | # if we compare `K.abs(x) < np.inf`. 69 | return .5 * K.square(x) 70 | 71 | condition = K.abs(x) < clip_value 72 | squared_loss = .5 * K.square(x) 73 | linear_loss = clip_value * (K.abs(x) - .5 * clip_value) 74 | if K.backend() == 'tensorflow': 75 | import tensorflow as tf 76 | if hasattr(tf, 'select'): 77 | return tf.select(condition, squared_loss, linear_loss) # condition, true, false 78 | else: 79 | return tf.where(condition, squared_loss, linear_loss) # condition, true, false 80 | elif K.backend() == 'theano': 81 | from theano import tensor as T 82 | return T.switch(condition, squared_loss, linear_loss) 83 | else: 84 | raise RuntimeError('Unknown backend "{}".'.format(K.backend())) 85 | 86 | 87 | class AdditionalUpdatesOptimizer(optimizers.Optimizer): 88 | def __init__(self, optimizer, additional_updates): 89 | super(AdditionalUpdatesOptimizer, self).__init__() 90 | self.optimizer = optimizer 91 | self.additional_updates = additional_updates 92 | 93 | def get_updates(self, params, constraints, loss): 94 | updates = self.optimizer.get_updates(params, constraints, loss) 95 | updates += self.additional_updates 96 | self.updates = updates 97 | return self.updates 98 | 99 | def get_config(self): 100 | return self.optimizer.get_config() 101 | 102 | 103 | # Based on https://github.com/openai/baselines/blob/master/baselines/common/mpi_running_mean_std.py 104 | class WhiteningNormalizer(object): 105 | def __init__(self, shape, eps=1e-2, dtype=np.float64): 106 | self.eps = eps 107 | self.shape = shape 108 | self.dtype = dtype 109 | 110 | self._sum = np.zeros(shape, dtype=dtype) 111 | self._sumsq = np.zeros(shape, dtype=dtype) 112 | self._count = 0 113 | 114 | self.mean = np.zeros(shape, dtype=dtype) 115 | self.std = np.ones(shape, dtype=dtype) 116 | 117 | def normalize(self, x): 118 | return (x - self.mean) / self.std 119 | 120 | def denormalize(self, x): 121 | return self.std * x + self.mean 122 | 123 | def update(self, x): 124 | if x.ndim == len(self.shape): 125 | x = x.reshape(-1, *self.shape) 126 | assert x.shape[1:] == self.shape 127 
| 128 | self._count += x.shape[0] 129 | self._sum += np.sum(x, axis=0) 130 | self._sumsq += np.sum(np.square(x), axis=0) 131 | 132 | self.mean = self._sum / float(self._count) 133 | self.std = np.sqrt(np.maximum(np.square(self.eps), self._sumsq / float(self._count) - np.square(self.mean))) 134 | -------------------------------------------------------------------------------- /tests/integration/test_discrete.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from numpy.testing import assert_allclose 5 | from gym.envs.debugging.two_round_deterministic_reward import TwoRoundDeterministicRewardEnv 6 | 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Activation, Flatten 9 | from keras.optimizers import Adam 10 | from rl.agents import DQNAgent, CEMAgent, SARSAAgent 11 | from rl.policy import EpsGreedyQPolicy 12 | from rl.memory import SequentialMemory, EpisodeParameterMemory 13 | 14 | 15 | def test_dqn(): 16 | env = TwoRoundDeterministicRewardEnv() 17 | np.random.seed(123) 18 | env.seed(123) 19 | random.seed(123) 20 | nb_actions = env.action_space.n 21 | 22 | # Next, we build a very simple model. 23 | model = Sequential() 24 | model.add(Dense(16, input_shape=(1,))) 25 | model.add(Activation('relu')) 26 | model.add(Dense(nb_actions)) 27 | model.add(Activation('linear')) 28 | 29 | memory = SequentialMemory(limit=1000, window_length=1) 30 | policy = EpsGreedyQPolicy(eps=.1) 31 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50, 32 | target_model_update=1e-1, policy=policy, enable_double_dqn=False) 33 | dqn.compile(Adam(lr=1e-3)) 34 | 35 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=0) 36 | policy.eps = 0. 37 | h = dqn.test(env, nb_episodes=20, visualize=False) 38 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 39 | 40 | 41 | def test_double_dqn(): 42 | env = TwoRoundDeterministicRewardEnv() 43 | np.random.seed(123) 44 | env.seed(123) 45 | random.seed(123) 46 | nb_actions = env.action_space.n 47 | 48 | # Next, we build a very simple model. 49 | model = Sequential() 50 | model.add(Dense(16, input_shape=(1,))) 51 | model.add(Activation('relu')) 52 | model.add(Dense(nb_actions)) 53 | model.add(Activation('linear')) 54 | 55 | memory = SequentialMemory(limit=1000, window_length=1) 56 | policy = EpsGreedyQPolicy(eps=.1) 57 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50, 58 | target_model_update=1e-1, policy=policy, enable_double_dqn=True) 59 | dqn.compile(Adam(lr=1e-3)) 60 | 61 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=0) 62 | policy.eps = 0. 63 | h = dqn.test(env, nb_episodes=20, visualize=False) 64 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 65 | 66 | 67 | def test_cem(): 68 | env = TwoRoundDeterministicRewardEnv() 69 | np.random.seed(123) 70 | env.seed(123) 71 | random.seed(123) 72 | nb_actions = env.action_space.n 73 | 74 | # Next, we build a very simple model. 
75 | model = Sequential() 76 | model.add(Dense(16, input_shape=(1,))) 77 | model.add(Activation('relu')) 78 | model.add(Dense(nb_actions)) 79 | model.add(Activation('linear')) 80 | 81 | memory = EpisodeParameterMemory(limit=1000, window_length=1) 82 | dqn = CEMAgent(model=model, nb_actions=nb_actions, memory=memory) 83 | dqn.compile() 84 | 85 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=1) 86 | h = dqn.test(env, nb_episodes=20, visualize=False) 87 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 88 | 89 | 90 | def test_duel_dqn(): 91 | env = TwoRoundDeterministicRewardEnv() 92 | np.random.seed(123) 93 | env.seed(123) 94 | random.seed(123) 95 | nb_actions = env.action_space.n 96 | 97 | # Next, we build a very simple model. 98 | model = Sequential() 99 | model.add(Dense(16, input_shape=(1,))) 100 | model.add(Activation('relu')) 101 | model.add(Dense(nb_actions, activation='linear')) 102 | 103 | memory = SequentialMemory(limit=1000, window_length=1) 104 | policy = EpsGreedyQPolicy(eps=.1) 105 | dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=50, 106 | target_model_update=1e-1, policy=policy, enable_double_dqn=False, enable_dueling_network=True) 107 | dqn.compile(Adam(lr=1e-3)) 108 | 109 | dqn.fit(env, nb_steps=2000, visualize=False, verbose=0) 110 | policy.eps = 0. 111 | h = dqn.test(env, nb_episodes=20, visualize=False) 112 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 113 | 114 | 115 | def test_sarsa(): 116 | env = TwoRoundDeterministicRewardEnv() 117 | np.random.seed(123) 118 | env.seed(123) 119 | random.seed(123) 120 | nb_actions = env.action_space.n 121 | 122 | # Next, we build a very simple model. 123 | model = Sequential() 124 | model.add(Dense(16, input_shape=(1,))) 125 | model.add(Activation('relu')) 126 | model.add(Dense(nb_actions, activation='linear')) 127 | 128 | policy = EpsGreedyQPolicy(eps=.1) 129 | sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=50, policy=policy) 130 | sarsa.compile(Adam(lr=1e-3)) 131 | 132 | sarsa.fit(env, nb_steps=20000, visualize=False, verbose=0) 133 | policy.eps = 0. 134 | h = sarsa.test(env, nb_episodes=20, visualize=False) 135 | assert_allclose(np.mean(h.history['episode_reward']), 3.) 
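# (Every test above asserts a mean episode reward of 3., the optimal return of the
#  deterministic two-round environment — a quick check that each agent actually
#  learns the task.)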
136 | -------------------------------------------------------------------------------- /examples/dqn_atari.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import argparse 3 | 4 | from PIL import Image 5 | import numpy as np 6 | import gym 7 | 8 | from keras.models import Sequential 9 | from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute 10 | from keras.optimizers import Adam 11 | import keras.backend as K 12 | 13 | from rl.agents.dqn import DQNAgent 14 | from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy 15 | from rl.memory import SequentialMemory 16 | from rl.core import Processor 17 | from rl.callbacks import FileLogger, ModelIntervalCheckpoint 18 | 19 | 20 | INPUT_SHAPE = (84, 84) 21 | WINDOW_LENGTH = 4 22 | 23 | 24 | class AtariProcessor(Processor): 25 | def process_observation(self, observation): 26 | assert observation.ndim == 3 # (height, width, channel) 27 | img = Image.fromarray(observation) 28 | img = img.resize(INPUT_SHAPE).convert('L') # resize and convert to grayscale 29 | processed_observation = np.array(img) 30 | assert processed_observation.shape == INPUT_SHAPE 31 | return processed_observation.astype('uint8') # saves storage in experience memory 32 | 33 | def process_state_batch(self, batch): 34 | # We could perform this processing step in `process_observation`. In this case, however, 35 | # we would need to store a `float32` array instead, which is 4x more memory intensive than 36 | # an `uint8` array. This matters if we store 1M observations. 37 | processed_batch = batch.astype('float32') / 255. 38 | return processed_batch 39 | 40 | def process_reward(self, reward): 41 | return np.clip(reward, -1., 1.) 42 | 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('--mode', choices=['train', 'test'], default='train') 45 | parser.add_argument('--env-name', type=str, default='BreakoutDeterministic-v4') 46 | parser.add_argument('--weights', type=str, default=None) 47 | args = parser.parse_args() 48 | 49 | # Get the environment and extract the number of actions. 50 | env = gym.make(args.env_name) 51 | np.random.seed(123) 52 | env.seed(123) 53 | nb_actions = env.action_space.n 54 | 55 | # Next, we build our model. We use the same model that was described by Mnih et al. (2015). 56 | input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE 57 | model = Sequential() 58 | if K.image_dim_ordering() == 'tf': 59 | # (width, height, channels) 60 | model.add(Permute((2, 3, 1), input_shape=input_shape)) 61 | elif K.image_dim_ordering() == 'th': 62 | # (channels, width, height) 63 | model.add(Permute((1, 2, 3), input_shape=input_shape)) 64 | else: 65 | raise RuntimeError('Unknown image_dim_ordering.') 66 | model.add(Convolution2D(32, 8, 8, subsample=(4, 4))) 67 | model.add(Activation('relu')) 68 | model.add(Convolution2D(64, 4, 4, subsample=(2, 2))) 69 | model.add(Activation('relu')) 70 | model.add(Convolution2D(64, 3, 3, subsample=(1, 1))) 71 | model.add(Activation('relu')) 72 | model.add(Flatten()) 73 | model.add(Dense(512)) 74 | model.add(Activation('relu')) 75 | model.add(Dense(nb_actions)) 76 | model.add(Activation('linear')) 77 | print(model.summary()) 78 | 79 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 80 | # even the metrics! 81 | memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH) 82 | processor = AtariProcessor() 83 | 84 | # Select a policy. 
We use eps-greedy action selection, which means that a random action is selected 85 | # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that 86 | # the agent initially explores the environment (high eps) and then gradually sticks to what it knows 87 | # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05 88 | # so that the agent still performs some random actions. This ensures that the agent cannot get stuck. 89 | policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, 90 | nb_steps=1000000) 91 | 92 | # The trade-off between exploration and exploitation is difficult and an on-going research topic. 93 | # If you want, you can experiment with the parameters or use a different policy. Another popular one 94 | # is Boltzmann-style exploration: 95 | # policy = BoltzmannQPolicy(tau=1.) 96 | # Feel free to give it a try! 97 | 98 | dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, 99 | processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000, 100 | train_interval=4, delta_clip=1.) 101 | dqn.compile(Adam(lr=.00025), metrics=['mae']) 102 | 103 | if args.mode == 'train': 104 | # Okay, now it's time to learn something! We capture the interrupt exception so that training 105 | # can be prematurely aborted. Notice that you can the built-in Keras callbacks! 106 | weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) 107 | checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f' 108 | log_filename = 'dqn_{}_log.json'.format(args.env_name) 109 | callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)] 110 | callbacks += [FileLogger(log_filename, interval=100)] 111 | dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000) 112 | 113 | # After training is done, we save the final weights one more time. 114 | dqn.save_weights(weights_filename, overwrite=True) 115 | 116 | # Finally, evaluate our algorithm for 10 episodes. 117 | dqn.test(env, nb_episodes=10, visualize=False) 118 | elif args.mode == 'test': 119 | weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name) 120 | if args.weights: 121 | weights_filename = args.weights 122 | dqn.load_weights(weights_filename) 123 | dqn.test(env, nb_episodes=10, visualize=True) 124 | -------------------------------------------------------------------------------- /docs/sources/agents/overview.md: -------------------------------------------------------------------------------- 1 | ## Available Agents 2 | 3 | | Name | Implementation | Observation Space | Action Space | 4 | | ---------------------- |------------------------| -------------------| ---------------| 5 | | [DQN](/agents/dqn) | `rl.agents.DQNAgent` | discrete or continuous | discrete | 6 | | [DDPG](/agents/ddpg) | `rl.agents.DDPGAgent` | discrete or continuous | continuous | 7 | | [NAF](/agents/naf) | `rl.agents.NAFAgent` | discrete or continuous | continuous | 8 | | [CEM](/agents/cem) | `rl.agents.CEMAgent` | discrete or continuous | discrete | 9 | | [SARSA](/agents/sarsa) | `rl.agents.SARSAAgent` | discrete or continuous | discrete | 10 | 11 | --- 12 | 13 | ## Common API 14 | 15 | All agents share a common API. This allows you to easily switch between different agents. 16 | That being said, keep in mind that some agents make assumptions regarding the action space, i.e. assume discrete 17 | or continuous actions. 
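For illustration, here is a minimal end-to-end sketch of that shared workflow. The hyperparameters are arbitrary and `CartPole-v0` is just a convenient small environment; see `examples/dqn_cartpole.py` for the repository's own version.

```python
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy

# Environment and a small Q-network with one output per discrete action.
env = gym.make('CartPole-v0')
nb_actions = env.action_space.n

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy(eps=.1)
agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                 nb_steps_warmup=50, target_model_update=1e-2, policy=policy)

agent.compile(Adam(lr=1e-3), metrics=['mae'])               # compile
agent.fit(env, nb_steps=10000, visualize=False, verbose=1)  # fit
agent.save_weights('dqn_CartPole-v0_weights.h5f', overwrite=True)  # save_weights
agent.test(env, nb_episodes=5, visualize=False)             # test
```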
18 | 19 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L44) 20 | 21 | ### fit 22 | 23 | 24 | ```python 25 | fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1, visualize=False, nb_max_start_steps=0, start_step_policy=None, log_interval=10000, nb_max_episode_steps=None) 26 | ``` 27 | 28 | 29 | Trains the agent on the given environment. 30 | 31 | __Arguments__ 32 | 33 | - __env:__ (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details. 34 | - __nb_steps__ (integer): Number of training steps to be performed. 35 | - __action_repetition__ (integer): Number of times the agent repeats the same action without 36 | observing the environment again. Setting this to a value > 1 can be useful 37 | if a single action only has a very small effect on the environment. 38 | - __callbacks__ (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances): 39 | List of callbacks to apply during training. See [callbacks](/callbacks) for details. 40 | - __verbose__ (integer): 0 for no logging, 1 for interval logging (compare `log_interval`), 2 for episode logging 41 | - __visualize__ (boolean): If `True`, the environment is visualized during training. However, 42 | this is likely going to slow down training significantly and is thus intended to be 43 | a debugging instrument. 44 | - __nb_max_start_steps__ (integer): Number of maximum steps that the agent performs at the beginning 45 | of each episode using `start_step_policy`. Notice that this is an upper limit since 46 | the exact number of steps to be performed is sampled uniformly from [0, max_start_steps] 47 | at the beginning of each episode. 48 | - __start_step_policy__ (`lambda observation: action`): The policy 49 | to follow if `nb_max_start_steps` > 0. If set to `None`, a random action is performed. 50 | - __log_interval__ (integer): If `verbose` = 1, the number of steps that are considered to be an interval. 51 | - __nb_max_episode_steps__ (integer): Number of steps per episode that the agent performs before 52 | automatically resetting the environment. Set to `None` if each episode should run 53 | (potentially indefinitely) until the environment signals a terminal state. 54 | 55 | __Returns__ 56 | 57 | A `keras.callbacks.History` instance that recorded the entire training process. 58 | 59 | ---- 60 | 61 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L231) 62 | 63 | ### test 64 | 65 | 66 | ```python 67 | test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1) 68 | ``` 69 | 70 | 71 | Callback that is called before training begins." 72 | 73 | ---- 74 | 75 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L391) 76 | 77 | ### compile 78 | 79 | 80 | ```python 81 | compile(self, optimizer, metrics=[]) 82 | ``` 83 | 84 | 85 | Compiles an agent and the underlaying models to be used for training and testing. 86 | 87 | __Arguments__ 88 | 89 | - __optimizer__ (`keras.optimizers.Optimizer` instance): The optimizer to be used during training. 90 | - __metrics__ (list of functions `lambda y_true, y_pred: metric`): The metrics to run during training. 
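For instance, the bundled examples pass a standard Keras optimizer together with an `'mae'` metric. Agents that train two models accept a list with one optimizer per model. In the sketch below, `dqn` and `ddpg` stand in for already-constructed agent instances:

```python
from keras.optimizers import Adam

# Most agents take a single optimizer:
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# DDPGAgent trains both an actor and a critic, so it accepts a list of two
# optimizers (as in examples/ddpg_mujoco.py):
ddpg.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])
```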
91 | 92 | ---- 93 | 94 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L39) 95 | 96 | ### get_config 97 | 98 | 99 | ```python 100 | get_config(self) 101 | ``` 102 | 103 | 104 | Configuration of the agent for serialization. 105 | 106 | ---- 107 | 108 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L364) 109 | 110 | ### reset_states 111 | 112 | 113 | ```python 114 | reset_states(self) 115 | ``` 116 | 117 | 118 | Resets all internally kept states after an episode is completed. 119 | 120 | ---- 121 | 122 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L400) 123 | 124 | ### load_weights 125 | 126 | 127 | ```python 128 | load_weights(self, filepath) 129 | ``` 130 | 131 | 132 | Loads the weights of an agent from an HDF5 file. 133 | 134 | __Arguments__ 135 | 136 | - __filepath__ (str): The path to the HDF5 file. 137 | 138 | ---- 139 | 140 | [[source]](https://github.com/matthiasplappert/keras-rl/blob/master/rl/core.py#L408) 141 | 142 | ### save_weights 143 | 144 | 145 | ```python 146 | save_weights(self, filepath, overwrite=False) 147 | ``` 148 | 149 | 150 | Saves the weights of an agent as an HDF5 file. 151 | 152 | __Arguments__ 153 | 154 | - __filepath__ (str): The path to where the weights should be saved. 155 | - __overwrite__ (boolean): If `False` and `filepath` already exists, raises an error. 156 | 157 | -------------------------------------------------------------------------------- /tests/rl/test_core.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pytest 3 | import numpy as np 4 | from numpy.testing import assert_allclose 5 | 6 | from rl.memory import SequentialMemory 7 | from rl.core import Agent, Env, Processor 8 | 9 | 10 | class TestEnv(Env): 11 | def __init__(self): 12 | super(TestEnv, self).__init__() 13 | 14 | def step(self, action): 15 | self.state += 1 16 | done = self.state >= 6 17 | reward = float(self.state) / 10. 18 | return np.array(self.state), reward, done, {} 19 | 20 | def reset(self): 21 | self.state = 1 22 | return np.array(self.state) 23 | 24 | def seed(self, seed=None): 25 | pass 26 | 27 | def configure(self, *args, **kwargs): 28 | pass 29 | 30 | 31 | class TestAgent(Agent): 32 | def __init__(self, memory, **kwargs): 33 | super(TestAgent, self).__init__(**kwargs) 34 | self.memory = memory 35 | 36 | def forward(self, observation): 37 | action = observation 38 | self.recent_action = action 39 | self.recent_observation = observation 40 | return action 41 | 42 | def backward(self, reward, terminal): 43 | metrics = [np.nan for _ in self.metrics_names] 44 | self.memory.append(self.recent_observation, self.recent_action, reward, terminal) 45 | return metrics 46 | 47 | def compile(self): 48 | self.compiled = True 49 | 50 | 51 | def test_fit_observations(): 52 | memory = SequentialMemory(100, window_length=2, ignore_episode_boundaries=False) 53 | agent = TestAgent(memory) 54 | env = TestEnv() 55 | agent.compile() 56 | agent.fit(env, 20, verbose=0) 57 | 58 | # Inspect memory to see if observations are correct. 
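    # (window_length=2 means every sampled state is the stack of the two most recent
    #  observations, zero-padded at the start of an episode — hence state0/state1
    #  pairs like [0, 1] and [1, 2] in the assertions below.)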
59 | experiencies = memory.sample(batch_size=8, batch_idxs=range(8)) 60 | 61 | assert experiencies[0].reward == .2 62 | assert experiencies[0].action == 1 63 | assert_allclose(experiencies[0].state0, np.array([0, 1])) 64 | assert_allclose(experiencies[0].state1, np.array([1, 2])) 65 | assert experiencies[0].terminal1 is False 66 | 67 | assert experiencies[1].reward == .3 68 | assert experiencies[1].action == 2 69 | assert_allclose(experiencies[1].state0, np.array([1, 2])) 70 | assert_allclose(experiencies[1].state1, np.array([2, 3])) 71 | assert experiencies[1].terminal1 is False 72 | 73 | assert experiencies[2].reward == .4 74 | assert experiencies[2].action == 3 75 | assert_allclose(experiencies[2].state0, np.array([2, 3])) 76 | assert_allclose(experiencies[2].state1, np.array([3, 4])) 77 | assert experiencies[2].terminal1 is False 78 | 79 | assert experiencies[3].reward == .5 80 | assert experiencies[3].action == 4 81 | assert_allclose(experiencies[3].state0, np.array([3, 4])) 82 | assert_allclose(experiencies[3].state1, np.array([4, 5])) 83 | assert experiencies[3].terminal1 is False 84 | 85 | assert experiencies[4].reward == .6 86 | assert experiencies[4].action == 5 87 | assert_allclose(experiencies[4].state0, np.array([4, 5])) 88 | assert_allclose(experiencies[4].state1, np.array([5, 6])) 89 | assert experiencies[4].terminal1 is True 90 | 91 | # Experience 5 has been re-sampled since since state0 would be terminal in which case we 92 | # cannot really have a meaningful transition because the environment gets reset. We thus 93 | # just ensure that state0 is not terminal. 94 | assert not np.all(experiencies[5].state0 == np.array([5, 6])) 95 | 96 | assert experiencies[6].reward == .2 97 | assert experiencies[6].action == 1 98 | assert_allclose(experiencies[6].state0, np.array([0, 1])) 99 | assert_allclose(experiencies[6].state1, np.array([1, 2])) 100 | assert experiencies[6].terminal1 is False 101 | 102 | assert experiencies[7].reward == .3 103 | assert experiencies[7].action == 2 104 | assert_allclose(experiencies[7].state0, np.array([1, 2])) 105 | assert_allclose(experiencies[7].state1, np.array([2, 3])) 106 | assert experiencies[7].terminal1 is False 107 | 108 | 109 | def test_copy_observations(): 110 | methods = [ 111 | 'fit', 112 | 'test', 113 | ] 114 | 115 | for method in methods: 116 | original_observations = [] 117 | 118 | class LocalEnv(Env): 119 | def __init__(self): 120 | super(LocalEnv, self).__init__() 121 | 122 | def step(self, action): 123 | self.state += 1 124 | done = self.state >= 6 125 | reward = float(self.state) / 10. 126 | obs = np.array(self.state) 127 | original_observations.append(obs) 128 | return obs, reward, done, {} 129 | 130 | def reset(self): 131 | self.state = 1 132 | return np.array(self.state) 133 | 134 | def seed(self, seed=None): 135 | pass 136 | 137 | def configure(self, *args, **kwargs): 138 | pass 139 | 140 | # Slight abuse of the processor for test purposes. 
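        # (process_step is called for every environment step, so recording the
        #  observations here lets the test check that what the agent pipeline receives
        #  is equal to — but not the very same object as — what the environment
        #  emitted, i.e. that observations get copied.)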
141 | observations = [] 142 | 143 | class LocalProcessor(Processor): 144 | def process_step(self, observation, reward, done, info): 145 | observations.append(observation) 146 | return observation, reward, done, info 147 | 148 | processor = LocalProcessor() 149 | memory = SequentialMemory(100, window_length=1) 150 | agent = TestAgent(memory, processor=processor) 151 | env = LocalEnv() 152 | agent.compile() 153 | getattr(agent, method)(env, 20, verbose=0, visualize=False) 154 | 155 | assert len(observations) == len(original_observations) 156 | assert_allclose(np.array(observations), np.array(original_observations)) 157 | assert np.all([o is not o_ for o, o_ in zip(original_observations, observations)]) 158 | 159 | 160 | if __name__ == '__main__': 161 | pytest.main([__file__]) 162 | -------------------------------------------------------------------------------- /rl/agents/cem.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import deque 3 | from copy import deepcopy 4 | 5 | import numpy as np 6 | import keras.backend as K 7 | from keras.models import Model 8 | 9 | from rl.core import Agent 10 | from rl.util import * 11 | 12 | class CEMAgent(Agent): 13 | """Write me 14 | """ 15 | def __init__(self, model, nb_actions, memory, batch_size=50, nb_steps_warmup=1000, 16 | train_interval=50, elite_frac=0.05, memory_interval=1, theta_init=None, 17 | noise_decay_const=0.0, noise_ampl=0.0, **kwargs): 18 | super(CEMAgent, self).__init__(**kwargs) 19 | 20 | # Parameters. 21 | self.nb_actions = nb_actions 22 | self.batch_size = batch_size 23 | self.elite_frac = elite_frac 24 | self.num_best = int(self.batch_size * self.elite_frac) 25 | self.nb_steps_warmup = nb_steps_warmup 26 | self.train_interval = train_interval 27 | self.memory_interval = memory_interval 28 | 29 | # if using noisy CEM, the minimum standard deviation will be ampl * exp (- decay_const * step ) 30 | self.noise_decay_const = noise_decay_const 31 | self.noise_ampl = noise_ampl 32 | 33 | # default initial mean & cov, override this by passing an theta_init argument 34 | self.init_mean = 0.0 35 | self.init_stdev = 1.0 36 | 37 | # Related objects. 38 | self.memory = memory 39 | self.model = model 40 | self.shapes = [w.shape for w in model.get_weights()] 41 | self.sizes = [w.size for w in model.get_weights()] 42 | self.num_weights = sum(self.sizes) 43 | 44 | # store the best result seen during training, as a tuple (reward, flat_weights) 45 | self.best_seen = (-np.inf, np.zeros(self.num_weights)) 46 | 47 | self.theta = np.zeros(self.num_weights*2) 48 | self.update_theta(theta_init) 49 | 50 | # State. 
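        # (episode counts finished episodes so that backward() only refits the sampling
        #  distribution every train_interval episodes; best_seen above is restored into
        #  the model by _on_train_end() once training finishes.)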
51 | self.episode = 0 52 | self.compiled = False 53 | self.reset_states() 54 | 55 | def compile(self): 56 | self.model.compile(optimizer='sgd', loss='mse') 57 | self.compiled = True 58 | 59 | def load_weights(self, filepath): 60 | self.model.load_weights(filepath) 61 | 62 | def save_weights(self, filepath, overwrite=False): 63 | self.model.save_weights(filepath, overwrite=overwrite) 64 | 65 | def get_weights_flat(self,weights): 66 | weights_flat = np.zeros(self.num_weights) 67 | 68 | pos = 0 69 | for i_layer, size in enumerate(self.sizes): 70 | weights_flat[pos:pos+size] = weights[i_layer].flatten() 71 | pos += size 72 | return weights_flat 73 | 74 | def get_weights_list(self,weights_flat): 75 | weights = [] 76 | pos = 0 77 | for i_layer, size in enumerate(self.sizes): 78 | arr = weights_flat[pos:pos+size].reshape(self.shapes[i_layer]) 79 | weights.append(arr) 80 | pos += size 81 | return weights 82 | 83 | def reset_states(self): 84 | self.recent_observation = None 85 | self.recent_action = None 86 | 87 | def select_action(self, state, stochastic=False): 88 | batch = np.array([state]) 89 | if self.processor is not None: 90 | batch = self.processor.process_state_batch(batch) 91 | 92 | action = self.model.predict_on_batch(batch).flatten() 93 | if stochastic or self.training: 94 | return np.random.choice(np.arange(self.nb_actions), p=np.exp(action) / np.sum(np.exp(action))) 95 | return np.argmax(action) 96 | 97 | def update_theta(self,theta): 98 | if (theta is not None): 99 | assert theta.shape == self.theta.shape, "Invalid theta, shape is {0} but should be {1}".format(theta.shape,self.theta.shape) 100 | assert (not np.isnan(theta).any()), "Invalid theta, NaN encountered" 101 | assert (theta[self.num_weights:] >= 0.).all(), "Invalid theta, standard deviations must be nonnegative" 102 | self.theta = theta 103 | else: 104 | means = np.ones(self.num_weights) * self.init_mean 105 | stdevs = np.ones(self.num_weights) * self.init_stdev 106 | self.theta = np.hstack((means,stdevs)) 107 | 108 | def choose_weights(self): 109 | mean = self.theta[:self.num_weights] 110 | std = self.theta[self.num_weights:] 111 | weights_flat = std * np.random.randn(self.num_weights) + mean 112 | 113 | sampled_weights = self.get_weights_list(weights_flat) 114 | self.model.set_weights(sampled_weights) 115 | 116 | def forward(self, observation): 117 | # Select an action. 118 | state = self.memory.get_recent_state(observation) 119 | action = self.select_action(state) 120 | if self.processor is not None: 121 | action = self.processor.process_action(action) 122 | 123 | # Book-keeping. 124 | self.recent_observation = observation 125 | self.recent_action = action 126 | 127 | return action 128 | 129 | @property 130 | def layers(self): 131 | return self.model.layers[:] 132 | 133 | def backward(self, reward, terminal): 134 | # Store most recent experience in memory. 135 | if self.step % self.memory_interval == 0: 136 | self.memory.append(self.recent_observation, self.recent_action, reward, terminal, 137 | training=self.training) 138 | 139 | metrics = [np.nan for _ in self.metrics_names] 140 | if not self.training: 141 | # We're done here. No need to update the experience memory since we only use the working 142 | # memory to obtain the state over the most recent observations. 
143 | return metrics 144 | 145 | if terminal: 146 | params = self.get_weights_flat(self.model.get_weights()) 147 | self.memory.finalize_episode(params) 148 | 149 | if self.step > self.nb_steps_warmup and self.episode % self.train_interval == 0: 150 | params, reward_totals = self.memory.sample(self.batch_size) 151 | best_idx = np.argsort(np.array(reward_totals))[-self.num_best:] 152 | best = np.vstack([params[i] for i in best_idx]) 153 | 154 | if reward_totals[best_idx[-1]] > self.best_seen[0]: 155 | self.best_seen = (reward_totals[best_idx[-1]], params[best_idx[-1]]) 156 | 157 | metrics = [np.mean(np.array(reward_totals)[best_idx])] 158 | if self.processor is not None: 159 | metrics += self.processor.metrics 160 | min_std = self.noise_ampl * np.exp(-self.step * self.noise_decay_const) 161 | 162 | mean = np.mean(best, axis=0) 163 | std = np.std(best, axis=0) + min_std 164 | new_theta = np.hstack((mean, std)) 165 | self.update_theta(new_theta) 166 | self.choose_weights() 167 | self.episode += 1 168 | return metrics 169 | 170 | def _on_train_end(self): 171 | self.model.set_weights(self.get_weights_list(self.best_seen[1])) 172 | 173 | @property 174 | def metrics_names(self): 175 | names = ['mean_best_reward'] 176 | if self.processor is not None: 177 | names += self.processor.metrics_names[:] 178 | return names 179 | -------------------------------------------------------------------------------- /tests/rl/agents/test_dqn.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import absolute_import 3 | 4 | import pytest 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | 8 | from keras.models import Sequential 9 | from keras.layers import Input, merge, Dense, Flatten 10 | 11 | from rl.agents.dqn import NAFLayer, DQNAgent, NAFAgent 12 | from rl.memory import SequentialMemory 13 | from rl.processors import MultiInputProcessor 14 | from rl.keras_future import concatenate, Model 15 | 16 | from ..util import MultiInputTestEnv 17 | 18 | 19 | def test_single_dqn_input(): 20 | model = Sequential() 21 | model.add(Flatten(input_shape=(2, 3))) 22 | model.add(Dense(2)) 23 | 24 | memory = SequentialMemory(limit=10, window_length=2) 25 | for double_dqn in (True, False): 26 | agent = DQNAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, 27 | enable_double_dqn=double_dqn) 28 | agent.compile('sgd') 29 | agent.fit(MultiInputTestEnv((3,)), nb_steps=10) 30 | 31 | 32 | def test_multi_dqn_input(): 33 | input1 = Input(shape=(2, 3)) 34 | input2 = Input(shape=(2, 4)) 35 | x = merge([input1, input2], mode='concat') 36 | x = Flatten()(x) 37 | x = Dense(2)(x) 38 | model = Model(input=[input1, input2], output=x) 39 | 40 | memory = SequentialMemory(limit=10, window_length=2) 41 | processor = MultiInputProcessor(nb_inputs=2) 42 | for double_dqn in (True, False): 43 | agent = DQNAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5, batch_size=4, 44 | processor=processor, enable_double_dqn=double_dqn) 45 | agent.compile('sgd') 46 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10) 47 | 48 | 49 | def test_single_continuous_dqn_input(): 50 | nb_actions = 2 51 | 52 | V_model = Sequential() 53 | V_model.add(Flatten(input_shape=(2, 3))) 54 | V_model.add(Dense(1)) 55 | 56 | mu_model = Sequential() 57 | mu_model.add(Flatten(input_shape=(2, 3))) 58 | mu_model.add(Dense(nb_actions)) 59 | 60 | L_input = Input(shape=(2, 3)) 61 | L_input_action = Input(shape=(nb_actions,)) 62 | x = 
concatenate([Flatten()(L_input), L_input_action]) 63 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 64 | L_model = Model(input=[L_input_action, L_input], output=x) 65 | 66 | memory = SequentialMemory(limit=10, window_length=2) 67 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 68 | memory=memory, nb_steps_warmup=5, batch_size=4) 69 | agent.compile('sgd') 70 | agent.fit(MultiInputTestEnv((3,)), nb_steps=10) 71 | 72 | 73 | def test_multi_continuous_dqn_input(): 74 | nb_actions = 2 75 | 76 | V_input1 = Input(shape=(2, 3)) 77 | V_input2 = Input(shape=(2, 4)) 78 | x = concatenate([V_input1, V_input2]) 79 | x = Flatten()(x) 80 | x = Dense(1)(x) 81 | V_model = Model(input=[V_input1, V_input2], output=x) 82 | 83 | mu_input1 = Input(shape=(2, 3)) 84 | mu_input2 = Input(shape=(2, 4)) 85 | x = concatenate([mu_input1, mu_input2]) 86 | x = Flatten()(x) 87 | x = Dense(nb_actions)(x) 88 | mu_model = Model(input=[mu_input1, mu_input2], output=x) 89 | 90 | L_input1 = Input(shape=(2, 3)) 91 | L_input2 = Input(shape=(2, 4)) 92 | L_input_action = Input(shape=(nb_actions,)) 93 | x = concatenate([L_input1, L_input2]) 94 | x = concatenate([Flatten()(x), L_input_action]) 95 | x = Dense(((nb_actions * nb_actions + nb_actions) // 2))(x) 96 | L_model = Model(input=[L_input_action, L_input1, L_input2], output=x) 97 | 98 | memory = SequentialMemory(limit=10, window_length=2) 99 | processor = MultiInputProcessor(nb_inputs=2) 100 | agent = NAFAgent(nb_actions=nb_actions, V_model=V_model, L_model=L_model, mu_model=mu_model, 101 | memory=memory, nb_steps_warmup=5, batch_size=4, processor=processor) 102 | agent.compile('sgd') 103 | agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10) 104 | 105 | 106 | def test_naf_layer_full(): 107 | batch_size = 2 108 | for nb_actions in (1, 3): 109 | # Construct single model with NAF as the only layer, hence it is fully deterministic 110 | # since no weights are used, which would be randomly initialized. 111 | L_flat_input = Input(shape=((nb_actions * nb_actions + nb_actions) // 2,)) 112 | mu_input = Input(shape=(nb_actions,)) 113 | action_input = Input(shape=(nb_actions,)) 114 | x = NAFLayer(nb_actions, mode='full')([L_flat_input, mu_input, action_input]) 115 | model = Model(input=[L_flat_input, mu_input, action_input], output=x) 116 | model.compile(loss='mse', optimizer='sgd') 117 | 118 | # Create random test data. 119 | L_flat = np.random.random((batch_size, (nb_actions * nb_actions + nb_actions) // 2)).astype('float32') 120 | mu = np.random.random((batch_size, nb_actions)).astype('float32') 121 | action = np.random.random((batch_size, nb_actions)).astype('float32') 122 | 123 | # Perform reference computations in numpy since these are much easier to verify. 124 | L = np.zeros((batch_size, nb_actions, nb_actions)).astype('float32') 125 | LT = np.copy(L) 126 | for l, l_T, l_flat in zip(L, LT, L_flat): 127 | l[np.tril_indices(nb_actions)] = l_flat 128 | l[np.diag_indices(nb_actions)] = np.exp(l[np.diag_indices(nb_actions)]) 129 | l_T[:, :] = l.T 130 | P = np.array([np.dot(l, l_T) for l, l_T in zip(L, LT)]).astype('float32') 131 | A_ref = np.array([np.dot(np.dot(a - m, p), a - m) for a, m, p in zip(action, mu, P)]).astype('float32') 132 | A_ref *= -.5 133 | 134 | # Finally, compute the output of the net, which should be identical to the previously 135 | # computed reference. 
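        # (The reference computes A(s, a) = -0.5 * (a - mu)^T P (a - mu) with P = L L^T,
        #  where L is lower-triangular with an exponentiated diagonal; NAFLayer in
        #  'full' mode is expected to produce exactly these values.)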
136 | A_net = model.predict([L_flat, mu, action]).flatten() 137 | assert_allclose(A_net, A_ref, rtol=1e-5) 138 | 139 | 140 | def test_naf_layer_diag(): 141 | batch_size = 2 142 | for nb_actions in (1, 3): 143 | # Construct single model with NAF as the only layer, hence it is fully deterministic 144 | # since no weights are used, which would be randomly initialized. 145 | L_flat_input = Input(shape=(nb_actions,)) 146 | mu_input = Input(shape=(nb_actions,)) 147 | action_input = Input(shape=(nb_actions,)) 148 | x = NAFLayer(nb_actions, mode='diag')([L_flat_input, mu_input, action_input]) 149 | model = Model(input=[L_flat_input, mu_input, action_input], output=x) 150 | model.compile(loss='mse', optimizer='sgd') 151 | 152 | # Create random test data. 153 | L_flat = np.random.random((batch_size, nb_actions)).astype('float32') 154 | mu = np.random.random((batch_size, nb_actions)).astype('float32') 155 | action = np.random.random((batch_size, nb_actions)).astype('float32') 156 | 157 | # Perform reference computations in numpy since these are much easier to verify. 158 | P = np.zeros((batch_size, nb_actions, nb_actions)).astype('float32') 159 | for p, l_flat in zip(P, L_flat): 160 | p[np.diag_indices(nb_actions)] = l_flat 161 | print(P, L_flat) 162 | A_ref = np.array([np.dot(np.dot(a - m, p), a - m) for a, m, p in zip(action, mu, P)]).astype('float32') 163 | A_ref *= -.5 164 | 165 | # Finally, compute the output of the net, which should be identical to the previously 166 | # computed reference. 167 | A_net = model.predict([L_flat, mu, action]).flatten() 168 | assert_allclose(A_net, A_ref, rtol=1e-5) 169 | 170 | 171 | if __name__ == '__main__': 172 | pytest.main([__file__]) 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning for Keras 2 | [](https://travis-ci.org/matthiasplappert/keras-rl) 3 | [](http://keras-rl.readthedocs.io/) 4 | [](https://github.com/matthiasplappert/keras-rl/blob/master/LICENSE) 5 | [](https://gitter.im/keras-rl/Lobby) 6 | 7 | 8 |
![Breakout](assets/breakout.gif) ![CartPole](assets/cartpole.gif) ![Pendulum](assets/pendulum.gif)