├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── ci_scripts ├── install.sh ├── success.sh └── test.sh ├── deer ├── __init__.py ├── agent.py ├── base_classes │ ├── __init__.py │ ├── environment.py │ ├── learning_algo.py │ └── policy.py ├── default_parser.py ├── experiment │ ├── __init__.py │ └── base_controllers.py ├── helper │ ├── __init__.py │ └── tree.py ├── learning_algos │ ├── AC_net_keras.py │ ├── CRAR_keras.py │ ├── NN_CRAR_keras.py │ ├── NN_keras.py │ ├── NN_keras_LSTM.py │ ├── __init__.py │ └── q_net_keras.py ├── policies │ ├── EpsilonGreedyPolicy.py │ ├── LongerExplorationPolicy.py │ └── __init__.py └── tests │ ├── __init__.py │ └── test_base.py ├── docs ├── Makefile ├── conf.py ├── index.rst ├── modules │ ├── agents.rst │ ├── controllers.rst │ ├── environments.rst │ ├── learning-algorithms.rst │ └── policies.rst └── user │ ├── development.rst │ ├── environments.rst │ ├── environments │ ├── ALE.rst │ ├── PLE.rst │ ├── gym.rst │ ├── planning.rst │ ├── toy_env_time_series.rst │ └── two_storages.rst │ ├── installation.rst │ └── tutorial.rst ├── examples ├── ALE │ ├── ALE_env.py │ ├── ALE_env_gym.py │ └── run_ALE.py ├── MG_two_storages │ ├── MG_two_storages_env.py │ ├── data │ │ ├── BelgiumPV_prod_test.npy │ │ ├── BelgiumPV_prod_train.npy │ │ ├── example_nondeterminist_cons_test.npy │ │ ├── example_nondeterminist_cons_train.npy │ │ └── spotmarket_data_2007-2013.xls │ ├── plot_MG_operation.py │ └── run_MG_two_storages.py ├── gym │ ├── mountain_car_continuous_env.py │ ├── mountain_car_env.py │ ├── pendulum_env.py │ ├── run_mountain_car.py │ ├── run_mountain_car_continuous.py │ └── run_pendulum.py ├── maze │ ├── a_star_path_finding.py │ ├── maze_env.py │ └── run_maze.py ├── test_CRAR │ ├── catcher_env.py │ ├── run_catcher.py │ ├── run_simple_maze.py │ └── simple_maze_env.py └── toy_env │ ├── Toy_env.py │ ├── run_toy_env.py │ └── run_toy_env_simple.py ├── readthedocs.yml ├── requirements-docs.txt ├── requirements.txt └── setup.py /.coveragerc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/.coveragerc -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyproj 3 | /.vs/General_Deep_Q_RL/v14 4 | /General_Deep_Q_RL.sln 5 | /General_Deep_Q_RL/theano.py 6 | /General_Deep_Q_RL/plot.png 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | language: python 3 | 4 | cache: 5 | apt: true 6 | # We use three different cache directory 7 | # to work around a Travis bug with multi-platform cache 8 | directories: 9 | - $HOME/.cache/pip 10 | - $HOME/download 11 | 12 | env: 13 | global: 14 | # Directory where tests are run from 15 | - TEST_DIR=/tmp/test_dir/ 16 | - MODULE=deer 17 | # - THEANO_VERSION="0.8" 18 | # - NUMPY_VERSION="1.10" 19 | # - SCIPY_VERSION="0.17" 20 | matrix: 21 | - PYTHON_VERSION="2.7" 22 | - PYTHON_VERSION="3.8" 23 | # - PYTHON_VERSION="3.8" EXAMPLE="toy_env" 24 | # - PYTHON_VERSION="3.8" EXAMPLE="mountain_car" 25 | 26 | install: source ci_scripts/install.sh 27 | script: bash ci_scripts/test.sh 28 | #after_success: source ci_scripts/success.sh 29 | 30 | -------------------------------------------------------------------------------- /LICENSE: 
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015, Vincent Francois-Lavet, David Taralla
 2 | All rights reserved.
 3 | 
 4 | Inspired from "Human-level control through deep reinforcement learning",
 5 | Nature, 518(7540):529-533, February 2015 and the implementation of Nathan
 6 | Sprague (https://github.com/spragunr/deep_q_rl)
 7 | 
 8 | This software is released under the 3-Clause BSD license.
 9 | 
10 | Redistribution and use in source and binary forms, with or without
11 | modification, are permitted provided that the following conditions are met:
12 |     * Redistributions of source code must retain the above copyright
13 |       notice, this list of conditions and the following disclaimer.
14 |     * Redistributions in binary form must reproduce the above copyright
15 |       notice, this list of conditions and the following disclaimer in the
16 |       documentation and/or other materials provided with the distribution.
17 |     * Neither the name of the nor the
18 |       names of its contributors may be used to endorse or promote products
19 |       derived from this software without specific prior written permission.
20 | 
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY
25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include *.md LICENSE *.rst
 2 | recursive-include docs *.rst
 3 | recursive-include docs *.py
 4 | prune docs/_build
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | .. -*- mode: rst -*-
 2 | 
 3 | |Python27|_ |Python36|_ |PyPi|_ |License|_
 4 | 
 5 | .. |Python27| image:: https://img.shields.io/badge/python-2.7-blue.svg
 6 | .. _Python27: https://badge.fury.io/py/deer
 7 | 
 8 | .. |Python36| image:: https://img.shields.io/badge/python-3.6-blue.svg
 9 | .. _Python36: https://badge.fury.io/py/deer
10 | 
11 | .. |PyPi| image:: https://badge.fury.io/py/deer.svg
12 | .. _PyPi: https://badge.fury.io/py/deer
13 | 
14 | .. |License| image:: https://img.shields.io/badge/license-BSD--3--Clause-blue.svg
15 | .. _License: https://github.com/VinF/deer/blob/master/LICENSE
16 | 
17 | DeeR
18 | ====
19 | 
20 | DeeR is a Python library for deep reinforcement learning. It is built with modularity in mind so that it can easily be adapted to any need. It provides many possibilities out of the box, such as Double Q-learning, prioritized experience replay, deep deterministic policy gradient (DDPG) and Combined Reinforcement via Abstract Representations (CRAR). Many different environment examples are also provided (some of them using OpenAI Gym).
21 | 
22 | Dependencies
23 | ============
24 | 
25 | This framework is tested to work under Python 3.6.
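It can for instance be installed from PyPI (see the PyPi badge above) with::

    pip install deer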
26 | 27 | The required dependencies are NumPy >= 1.10, joblib >= 0.9. You also need Keras>=2.6. 28 | 29 | For running the examples, Matplotlib >= 1.1.1 is required. 30 | For running the atari games environment, you need to install ALE >= 0.4. 31 | 32 | Full Documentation 33 | ================== 34 | 35 | The documentation is available at : http://deer.readthedocs.io/ 36 | -------------------------------------------------------------------------------- /ci_scripts/install.sh: -------------------------------------------------------------------------------- 1 | # inspired from scikit-learn contrib 2 | 3 | # Deactivate the travis-provided virtual environment and setup a 4 | # conda-based environment instead 5 | deactivate 6 | 7 | # Use the miniconda installer for faster download / install of conda 8 | # itself 9 | pushd . 10 | cd 11 | mkdir -p download 12 | cd download 13 | echo "Cached in $HOME/download :" 14 | ls -l 15 | echo 16 | if [[ ! -f miniconda.sh ]] 17 | then 18 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ #Miniconda3-4.5.4-Linux-x86_64.sh \ 19 | -O miniconda.sh 20 | fi 21 | chmod +x miniconda.sh && ./miniconda.sh -b 22 | cd .. 23 | ls /home/travis 24 | export PATH=/home/travis/miniconda/bin:$PATH 25 | export PATH=/home/travis/miniconda2/bin:$PATH 26 | export PATH=/home/travis/miniconda3/bin:$PATH 27 | conda update --yes conda 28 | popd 29 | 30 | # Configure the conda environment and put it in the path using the 31 | # provided versions 32 | conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ 33 | numpy 34 | 35 | #conda install libgcc -y 36 | source activate testenv 37 | pip install --upgrade pip 38 | pip install scipy 39 | pip install tensorflow-cpu 40 | pip install keras 41 | pip install matplotlib 42 | pip install joblib 43 | #pip install cython 44 | 45 | #if [[ "$PYTHON_VERSION" == "2.7" ]]; then 46 | # pip install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.1-cp27-none-linux_x86_64.whl # tensorflow 47 | #elif [[ "$PYTHON_VERSION" == "3.5" ]]; then 48 | # pip install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.1-cp35-cp35m-linux_x86_64.whl 49 | #fi 50 | 51 | if [[ "$COVERAGE" == "true" ]]; then 52 | pip install coverage coveralls 53 | fi 54 | 55 | python --version 56 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 57 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 58 | #python -c "import theano; print('theano %s' % theano.__version__)" 59 | python -c "import tensorflow; print('tensorflow %s' % tensorflow.__version__)" 60 | 61 | python setup.py develop 62 | -------------------------------------------------------------------------------- /ci_scripts/success.sh: -------------------------------------------------------------------------------- 1 | # inspired from scikit-learn contrib 2 | 3 | set -e 4 | 5 | if [[ "$COVERAGE" == "true" ]]; then 6 | # Need to run coveralls from a git checkout, so we copy .coverage 7 | # from TEST_DIR where nosetests has been run 8 | cp $TEST_DIR/.coverage $TRAVIS_BUILD_DIR 9 | cd $TRAVIS_BUILD_DIR 10 | # Ignore coveralls failures as the coveralls server is not 11 | # very reliable but we don't want travis to report a failure 12 | # in the github UI just because the coverage report failed to 13 | # be published. 
14 |     coveralls || echo "Coveralls upload failed"
15 | fi
16 | 
--------------------------------------------------------------------------------
/ci_scripts/test.sh:
--------------------------------------------------------------------------------
 1 | # inspired from scikit-learn contrib
 2 | 
 3 | set -e
 4 | 
 5 | if [[ "$EXAMPLE" == "toy_env" ]]; then
 6 |     cd examples/toy_env
 7 |     python run_toy_env.py --epochs 5
 8 |     python run_toy_env_simple.py & sleep 30; kill $!
 9 | 
10 | elif [[ "$EXAMPLE" == "mountain_car" ]]; then
11 |     pip install gym
12 |     cd examples/gym
13 |     python run_mountain_car.py --epochs 5
14 | 
15 |     # pip -V pip
16 |     # python run_mountain_car_continuous.py --epochs 5
17 | 
18 | else
19 |     # Get into a temp directory to run the tests from the installed package and
20 |     # check that we do not leave artifacts
21 |     mkdir -p $TEST_DIR
22 | 
23 |     cd $TEST_DIR
24 | 
25 |     if [[ "$COVERAGE" == "true" ]]; then
26 |         nosetests -vs --with-coverage --cover-package=$MODULE $MODULE
27 |     else
28 |         nosetests -vs $MODULE
29 |     fi
30 | 
31 | fi
32 | 
--------------------------------------------------------------------------------
/deer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/deer/__init__.py
--------------------------------------------------------------------------------
/deer/base_classes/__init__.py:
--------------------------------------------------------------------------------
 1 | from .environment import Environment
 2 | from .learning_algo import LearningAlgo
 3 | from .policy import Policy
--------------------------------------------------------------------------------
/deer/base_classes/environment.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module defines the base class for the environments.
 3 | 
 4 | """
 5 | 
 6 | import numpy as np
 7 | 
 8 | class Environment(object):
 9 |     """All your Environment classes should inherit this interface.
10 | 
11 |     The environment defines the dynamics and the reward signal that the agent observes when interacting with it.
12 | 
13 |     At every time step, the agent receives from the environment a collection of observable elements. Observing the
14 |     environment at time t thus corresponds to obtaining a punctual observation for each of these elements. Depending on
15 |     the control problem to solve, it may be useful for the agent to base its action not only on the current punctual
16 |     observations but on a history of the last punctual observations. In this framework, it is the environment that
17 |     defines, for each observable element, how many past punctual observations have to be considered.
18 | 
19 |     Different "modes" are used in this framework to allow the environment to have different dynamics and/or reward signal.
20 |     For instance, in training mode, only a part of the dynamics may be available so that it is possible to see how well
21 |     the agent generalizes to a slightly different one.
22 |     """
23 | 
24 |     def reset(self, mode):
25 |         """Resets the environment and puts it in mode [mode]. This function is called at the beginning of every new episode.
26 | 
27 |         The [mode] can for instance be used to discriminate between an agent that is training and an agent that is being
28 |         evaluated for a validation or generalization score. The mode is set only by resetting the environment with this
29 |         method and is preserved until the next call to reset().
30 | 31 | Parameters 32 | ----------- 33 | mode : int 34 | The mode to put the environment into. Mode "-1" is reserved and always means "training". 35 | 36 | Returns 37 | ------- 38 | Initialization of the pseudo state at the beginning of a new episode: list (of lists) with size given by inputDimensions 39 | """ 40 | 41 | raise NotImplementedError() 42 | 43 | def act(self, action): 44 | """Applies the agent action [action] on the environment. 45 | 46 | Parameters 47 | ----------- 48 | action : int 49 | The action selected by the agent to operate on the environment. Should be an identifier 50 | included between 0 included and nActions() excluded. 51 | """ 52 | 53 | raise NotImplementedError() 54 | 55 | def inputDimensions(self): 56 | """Gets the shape of the input space for this environment. 57 | 58 | This returns a list whose length is the number of observations in the environment. Each element of the list is a tuple: 59 | the first integer is always the history size considered for this observation and the rest describes the shape of the 60 | observation at a given time step. For instance: 61 | - () or (1,) means each observation at a given time step is a single scalar and the history size is 1 (= only current 62 | observation) 63 | - (N,) means each observation at a given time step is a single scalar and the history size is N 64 | - (N, M) means each observation at a given time step is a vector of length M and the history size is N 65 | - (N, M1, M2) means each observation at a given time step is a 2D matrix with M1 rows and M2 columns and the history 66 | size is N 67 | """ 68 | 69 | raise NotImplementedError() 70 | 71 | def nActions(self): 72 | """Gets the number of different actions that can be taken on this environment. 73 | It can be either an integer in the case of a finite discrete number of actions 74 | or it can be a list of couples [min_action_value,max_action_value] for a continuous action space""" 75 | 76 | raise NotImplementedError() 77 | 78 | def inTerminalState(self): 79 | """Tells whether the environment reached a terminal state after the last transition (i.e. the last transition 80 | that occured was terminal). 81 | 82 | As the majority of control tasks considered have no end (a continuous control should be operated), by default 83 | this returns always False. But in the context of a video game for instance, terminal states can happen and in 84 | these cases, this method should be overridden. 85 | 86 | Returns 87 | ------- 88 | isTerminal : bool 89 | Whether or not the current state is terminal 90 | """ 91 | 92 | return False 93 | 94 | def observe(self): 95 | """Gets a list of punctual observations composing this environment. 96 | 97 | This returns a list where element i is a punctual observation. Note that the history of observations is not 98 | returned and only the current observation is. 99 | 100 | See the documentation of inputDimensions() for more information about the shape of the observations. 101 | """ 102 | 103 | raise NotImplementedError() 104 | 105 | def summarizePerformance(self, test_data_set, *args, **kwargs): 106 | """Optional hook that can be used to show a summary of the performance of the agent on the 107 | environment in the current mode. 108 | 109 | Parameters 110 | ----------- 111 | test_data_set : agent.DataSet 112 | The dataset maintained by the agent in the current mode, which contains 113 | observations, actions taken and rewards obtained, as well as wether each transition was terminal or 114 | not. 
Refer to the documentation of agent.DataSet for more information.
115 |         """
116 | 
117 |         pass
118 | 
119 |     def observationType(self, subject):
120 |         """Gets the innermost type (np.uint8, np.float32, ...) of [subject].
121 | 
122 |         Parameters
123 |         -----------
124 |         subject : int
125 |             The subject
126 |         """
127 | 
128 |         return np.float32
129 | 
130 |     def end(self):
131 |         """Optional hook called at the end of all epochs
132 |         """
133 | 
134 |         pass
135 | 
--------------------------------------------------------------------------------
/deer/base_classes/learning_algo.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module defines the base class for the learning algorithms.
 3 | 
 4 | """
 5 | 
 6 | 
 7 | class LearningAlgo(object):
 8 |     """ All the Q-networks, actor-critic networks, etc. should inherit this interface.
 9 | 
10 |     Parameters
11 |     -----------
12 |     environment : object from class Environment
13 |         The environment linked to the Q-network
14 |     batch_size : int
15 |         Number of tuples taken into account for each iteration of gradient descent
16 |     """
17 |     def __init__(self, environment, batch_size):
18 |         self._environment = environment
19 |         self._df = 0.9
20 |         self._lr = 0.005
21 |         self._input_dimensions = self._environment.inputDimensions()
22 |         self._n_actions = self._environment.nActions()
23 |         self._batch_size = batch_size
24 | 
25 |     def train(self, states, actions, rewards, nextStates, terminals):
26 |         """ This method performs the training step (e.g. using Bellman iteration in a deep Q-network)
27 |         for one batch of tuples.
28 |         """
29 |         raise NotImplementedError()
30 | 
31 |     def chooseBestAction(self, state):
32 |         """ Get the best action for a pseudo-state
33 |         """
34 |         raise NotImplementedError()
35 | 
36 |     def qValues(self, state):
37 |         """ Get the q values for one pseudo-state
38 |         """
39 |         raise NotImplementedError()
40 | 
41 |     def setLearningRate(self, lr):
42 |         """ Setting the learning rate
43 |         NB: The learning rate usually has to be set in the optimizer; hence, this function should
44 |         be overridden. Otherwise, the learning rate change is likely not to be taken into account.
45 | 
46 |         Parameters
47 |         -----------
48 |         lr : float
49 |             The learning rate that has to be set
50 |         """
51 |         self._lr = lr
52 | 
53 |     def setDiscountFactor(self, df):
54 |         """ Setting the discount factor
55 | 
56 |         Parameters
57 |         -----------
58 |         df : float
59 |             The discount factor that has to be set
60 |         """
61 |         if df < 0. or df > 1.:
62 |             raise ValueError("The discount factor should be in [0,1]")
63 | 
64 |         self._df = df
65 | 
66 |     def learningRate(self):
67 |         """ Getting the learning rate
68 |         """
69 |         return self._lr
70 | 
71 |     def discountFactor(self):
72 |         """ Getting the discount factor
73 |         """
74 |         return self._df
75 | 
76 | if __name__ == "__main__":
77 |     pass
78 | 
--------------------------------------------------------------------------------
/deer/base_classes/policy.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module defines the base class for the policies.
 3 | 
 4 | """
 5 | 
 6 | import numpy as np
 7 | 
 8 | class Policy(object):
 9 |     """Abstract class for all policies.
10 |     A policy takes observations as input, and outputs an action.
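    Concrete policies implement action(); they typically do so by combining the bestAction() and randomAction()
    helpers defined below (see e.g. EpsilonGreedyPolicy).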
11 | 12 | Parameters 13 | ----------- 14 | learning_algo : object from class LearningALgo 15 | n_actions : int or list 16 | Definition of the action space provided by Environment.nActions() 17 | random_state : numpy random number generator 18 | """ 19 | 20 | def __init__(self, learning_algo, n_actions,random_state): 21 | self.learning_algo = learning_algo 22 | self.n_actions = n_actions 23 | self.random_state = random_state 24 | 25 | pass 26 | 27 | def bestAction(self, state, mode=None, *args, **kwargs): 28 | """ Returns the best Action for the given state. This is an additional encapsulation for q-network. 29 | """ 30 | action,V = self.learning_algo.chooseBestAction(state, mode, *args, **kwargs) 31 | return action, V 32 | 33 | def randomAction(self): 34 | """ Returns a random action 35 | """ 36 | if ( isinstance(self.n_actions,int)): 37 | # Discrete set of actions [0,nactions[ 38 | action = self.random_state.randint(0, self.n_actions) 39 | else: 40 | # Continuous set of actions 41 | action=[] 42 | for a in self.n_actions: 43 | action.append( self.random_state.uniform(a[0],a[1]) ) 44 | action=np.array(action) 45 | 46 | V = 0 47 | return action, V 48 | 49 | 50 | def action(self, state): 51 | """Main method of the Policy class. It can be called by agent.py, given a state, 52 | and should return a valid action w.r.t. the environment given to the constructor. 53 | """ 54 | raise NotImplementedError() 55 | -------------------------------------------------------------------------------- /deer/default_parser.py: -------------------------------------------------------------------------------- 1 | """This module contains a function to help parse command-line arguments. 2 | 3 | """ 4 | 5 | 6 | import argparse 7 | 8 | def process_args(args, defaults): 9 | """Handle the command line and return an object containing all the parameters. 10 | 11 | Arguments: 12 | args - list of command line arguments (not including executable name) 13 | defaults - a name space with variables corresponding to each of the required default command line values. 
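    A typical call is expected to look like the following, where Defaults is any object or namespace
    exposing the upper-case attributes referenced below (EPOCHS, STEPS_PER_EPOCH, STEPS_PER_TEST, ...):

        parameters = process_args(sys.argv[1:], Defaults)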
14 | """ 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('-e', '--epochs', dest="epochs", type=int, 18 | default=defaults.EPOCHS, 19 | help='Number of training epochs (default: %(default)s)') 20 | parser.add_argument('-s', '--steps-per-epoch', dest="steps_per_epoch", 21 | type=int, default=defaults.STEPS_PER_EPOCH, 22 | help='Number of steps per epoch (default: %(default)s)') 23 | parser.add_argument('-t', '--test-length', dest="steps_per_test", 24 | type=int, default=defaults.STEPS_PER_TEST, 25 | help='Number of steps per test (default: %(default)s)') 26 | parser.add_argument('-f', '--freq_summary_perfs', dest="period_btw_summary_perfs", 27 | type=int, default=defaults.PERIOD_BTW_SUMMARY_PERFS, 28 | help='freq summary perfs (default: %(default)s)') 29 | parser.add_argument('--frame-skip', dest="frame_skip", 30 | default=defaults.FRAME_SKIP, type=int, 31 | help='Every how many frames to process ' 32 | '(default: %(default)s)') 33 | parser.add_argument('--update-rule', dest="update_rule", 34 | type=str, default=defaults.UPDATE_RULE, 35 | help=('deepmind_rmsprop|rmsprop|sgd ' + 36 | '(default: %(default)s)')) 37 | parser.add_argument('--learning-rate', dest="learning_rate", 38 | type=float, default=defaults.LEARNING_RATE, 39 | help='Learning rate (default: %(default)s)') 40 | parser.add_argument('--learning-rate-decay', dest="learning_rate_decay", 41 | type=float, default=defaults.LEARNING_RATE_DECAY, 42 | help='Learning rate (default: %(default)s)') 43 | parser.add_argument('--rms-decay', dest="rms_decay", 44 | type=float, default=defaults.RMS_DECAY, 45 | help='Decay rate for rms_prop (default: %(default)s)') 46 | parser.add_argument('--rms-epsilon', dest="rms_epsilon", 47 | type=float, default=defaults.RMS_EPSILON, 48 | help='Denominator epsilson for rms_prop ' + 49 | '(default: %(default)s)') 50 | parser.add_argument('--momentum', type=float, default=defaults.MOMENTUM, 51 | help=('Momentum term for Nesterov momentum. '+ 52 | '(default: %(default)s)')) 53 | parser.add_argument('--clip-norm', dest="clip_norm", type=float, 54 | default=defaults.CLIP_NORM, 55 | help=('Max L2 norm for the gradient. ' + 56 | '(default: %(default)s)')) 57 | parser.add_argument('--discount', type=float, default=defaults.DISCOUNT, 58 | help='Discount rate init') 59 | parser.add_argument('--discount_inc', type=float, default=defaults.DISCOUNT_INC, 60 | help='Discount rate') 61 | parser.add_argument('--discount_max', type=float, default=defaults.DISCOUNT_MAX, 62 | help='Discount rate max') 63 | parser.add_argument('--epsilon-start', dest="epsilon_start", 64 | type=float, default=defaults.EPSILON_START, 65 | help=('Starting value for epsilon. ' + 66 | '(default: %(default)s)')) 67 | parser.add_argument('--epsilon-min', dest="epsilon_min", 68 | type=float, default=defaults.EPSILON_MIN, 69 | help='Minimum epsilon. (default: %(default)s)') 70 | parser.add_argument('--epsilon-decay', dest="epsilon_decay", 71 | type=float, default=defaults.EPSILON_DECAY, 72 | help=('Number of steps to minimum epsilon. ' + 73 | '(default: %(default)s)')) 74 | parser.add_argument('--max-history', dest="replay_memory_size", 75 | type=int, default=defaults.REPLAY_MEMORY_SIZE, 76 | help=('Maximum number of steps stored in replay ' + 77 | 'memory. (default: %(default)s)')) 78 | parser.add_argument('--batch-size', dest="batch_size", 79 | type=int, default=defaults.BATCH_SIZE, 80 | help='Batch size. 
(default: %(default)s)') 81 | parser.add_argument('--freeze-interval', dest="freeze_interval", 82 | type=int, default=defaults.FREEZE_INTERVAL, 83 | help=('Interval between target freezes. ' + 84 | '(default: %(default)s)')) 85 | parser.add_argument('--update-frequency', dest="update_frequency", 86 | type=int, default=defaults.UPDATE_FREQUENCY, 87 | help=('Number of actions before each SGD update. '+ 88 | '(default: %(default)s)')) 89 | parser.add_argument('--deterministic', dest='deterministic', action='store_true', 90 | help=('If fixed seed (default: %(default)s)')) 91 | parser.add_argument('--no-deterministic', dest='deterministic', action='store_false', 92 | help=('If no fixed seed')) 93 | parser.set_defaults(deterministic=defaults.DETERMINISTIC) 94 | parser.add_argument('--param1', dest="param1") # Additional parameter depending on the environment 95 | parser.add_argument('--param2', dest="param2") # Additional parameter depending on the environment 96 | parser.add_argument('--param3', dest="param3") # Additional parameter depending on the environment 97 | 98 | parameters = parser.parse_args(args) 99 | 100 | return parameters 101 | 102 | if __name__ == '__main__': 103 | pass 104 | -------------------------------------------------------------------------------- /deer/experiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/deer/experiment/__init__.py -------------------------------------------------------------------------------- /deer/helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/deer/helper/__init__.py -------------------------------------------------------------------------------- /deer/helper/tree.py: -------------------------------------------------------------------------------- 1 | """ Implementation of a binary tree for prioritized experience replay. 2 | Each leaf node is a past experience with its associated priority. 3 | Each parent node is the sum of the priorities of its children. 4 | The tree data structure serves purpose of efficient O(log(n)) priority 5 | update and random batch generation. 6 | 7 | One may check out Schaul et al. (2016) - Prioritized Experience Replay. 8 | 9 | Author: Aaron Zixiao Qiu 10 | """ 11 | 12 | import numpy as np 13 | 14 | class Node: 15 | def __init__(self, position=-1, priority=0, end=-1): 16 | """ The information contained in each node is: 17 | - Children and parent 18 | - Position: indice of the transition in the replay memory, i.e. 19 | the circular buffer used for storing the experiences 20 | - Priority: sum of the priorities of the children. If leaf node, 21 | then it is the priority of the transition. 22 | - End: variable used for tree search based on Position 23 | 24 | """ 25 | self.left = None 26 | self.right = None 27 | self.parent = None 28 | self.position = position 29 | self.priority = priority 30 | self.end = end 31 | 32 | def hasChildren(self): 33 | if (self.right == None and self.left == None): 34 | return False 35 | return True 36 | 37 | class SumTree: 38 | def __init__(self, size): 39 | """ The tree does not implement any insert-related method 40 | because the idea is to initialize the tree to have the same 41 | number of leaves as the size of the replay memory. 
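        A typical usage sketch (sizes and priorities below are only illustrative): build a SumTree with the
        size of the replay memory, call update(index, priority) whenever a transition is stored or has just
        been replayed, and draw prioritized indices with getBatch(n, rng, dataset).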
42 | """ 43 | 44 | self._root = Node() 45 | size_left = int(size/2) 46 | # Initialization of the tree 47 | self._root.left = self._createSubtree(self._root, 0, size_left) # [a,b[ 48 | self._root.right = self._createSubtree(self._root, size_left, size) 49 | self._max_priority = 1 50 | 51 | def _createSubtree(self, parent, begin, end): 52 | """ Build balanced subtrees. 53 | The leaf nodes have their "priority" initialized to 0 and 54 | "position" from 0 to n-1, with n being the size of the replay 55 | memory. 56 | The inner nodes are built while setting their "end" value that 57 | is used to position based search in the tree. 58 | 59 | Arguments: 60 | parent - parent node 61 | begin - lower bound of the range of positions 62 | end - upper bound (excluded) of the range of positions 63 | Return: 64 | node - root of the subtree 65 | """ 66 | n_elem = end - begin 67 | if (n_elem == 1): 68 | node = Node(position=begin) 69 | node.parent = parent 70 | node.end = end 71 | return node 72 | 73 | # At least 2 values (leaves) left 74 | mid = int((end + begin)/2) 75 | node = Node(end=end) 76 | node.parent = parent 77 | node.left = self._createSubtree(node, begin, mid) 78 | node.right = self._createSubtree(node, mid, end) 79 | return node 80 | 81 | def update(self, index, priority=-1): 82 | """ Update a leaf and the tree priorities. 83 | When the replay memory is updated with a new transition, it is 84 | also updated in the tree. The priority of the successive parent 85 | nodes are also modified. 86 | The function is also used to update the priority of an existing 87 | transtion after it has been replayed. 88 | 89 | Arguments: 90 | index - index of the leaf corresponding to the index of the 91 | new transition in the replay memory 92 | priority - the new priority of the leaf 93 | """ 94 | if (priority == -1): 95 | priority = self._max_priority 96 | elif (priority > self._max_priority): 97 | self._max_priority = priority 98 | 99 | # Search for index 100 | node = self.findIndex(index) 101 | 102 | # Replace with new priority 103 | diff = priority - node.priority 104 | node.priority = priority 105 | 106 | # Update value 107 | self._updateValue(node.parent, diff) 108 | 109 | def _updateValue(self, node, diff): 110 | node.priority += diff 111 | if (node.parent != None): 112 | self._updateValue(node.parent, diff) 113 | 114 | def findIndex(self, index): 115 | """ Find a leaf based on the index. 116 | 117 | Arguments: 118 | index - integer between 0 and n-1, n being the size of the 119 | replay memory 120 | Return: 121 | node - leaf with the index 122 | """ 123 | if(self._root != None): 124 | return self._findIndex(index, self._root) 125 | else: 126 | return None 127 | 128 | def _findIndex(self, index, node): 129 | if (node.position == index): 130 | return node 131 | 132 | if (index < node.left.end): 133 | return self._findIndex(index, node.left) 134 | else: 135 | return self._findIndex(index, node.right) 136 | 137 | def getBatch(self, n, rng, dataset): 138 | """ Generate the indices of a random batch of size n. 139 | The samples within the random batch are selected following 140 | the priorities (probabilities) of each transition in the replay 141 | memory. 142 | 143 | Argument: 144 | rng - number of elements in the random batch 145 | Return: 146 | indices - list with indices drawn w.r.t. the transition 147 | priorities. 
148 | """ 149 | pmax = self._root.priority 150 | step = pmax / n 151 | indices = np.zeros(n, dtype='int32') 152 | for i in range(n): 153 | p = rng.uniform(i*step, (i+1)*step) 154 | node = self.find(p) 155 | index = self._checkTerminal(node.position, dataset) 156 | if (index >= 0): 157 | indices[i] = index 158 | else: 159 | return np.zeros(0) 160 | 161 | return indices 162 | 163 | def _checkTerminal(self, index, dataset): 164 | """ Avoid terminal states in the x samples preceding the chosen 165 | index. 166 | 167 | Argument: 168 | index - chosen index based on priority 169 | dataset - contains the circular buffers 170 | Return: 171 | index - checked or corrected value of the input index. 172 | """ 173 | history_size = dataset._max_history_size 174 | terminals = dataset._terminals 175 | n_elems = dataset.n_elems 176 | 177 | lower_bound = history_size - 1 178 | 179 | # Check if the index is valid wrt terminals 180 | first_try = index 181 | start_wrapped = False 182 | while True: 183 | i = index - 1 184 | processed = 0 185 | for _ in range(history_size - 1): 186 | if (i < 0 or terminals[i]): 187 | break; 188 | 189 | i -= 1 190 | processed += 1 191 | 192 | if (processed < history_size - 1): 193 | # if we stopped prematurely, shift slice to the left and try again 194 | index = i 195 | if (index < lower_bound): 196 | start_wrapped = True 197 | index = n_elems - 1 198 | if (start_wrapped and index <= first_try): 199 | return -1 200 | else: 201 | # else index was ok according to terminals 202 | return index 203 | 204 | def find(self, priority): 205 | """ Find a leaf based on the priority. 206 | 207 | Arguments: 208 | priority - the target priority generated randomly 209 | Return: 210 | node - the closest leaf node with a greater priority 211 | """ 212 | if(self._root != None): 213 | return self._find(priority, self._root) 214 | else: 215 | return None 216 | 217 | def _find(self, priority, node): 218 | if (not node.hasChildren()): 219 | return node 220 | 221 | if(priority <= node.left.priority): 222 | return self._find(priority, node.left) 223 | else: 224 | return self._find(priority - node.left.priority, node.right) 225 | 226 | def printTree(self): 227 | # Classical printout method. Mostly for debugging purposes. 
228 | if(self._root != None): 229 | self._printTree(self._root) 230 | 231 | print("===============") 232 | 233 | def _printTree(self, node): 234 | if(node != None): 235 | self._printTree(node.left) 236 | print(node.position, node.priority) 237 | self._printTree(node.right) 238 | 239 | 240 | if __name__ == "__main__": 241 | t = SumTree(10) 242 | t.update(1, 1) 243 | t.update(2, 0.2) 244 | t.update(3, 3.3) 245 | t.update(4, 2.5) 246 | t.update(6, 2) 247 | t.printTree() 248 | 249 | rng = np.random.RandomState() 250 | for _ in range(10): 251 | print(t.getBatch(10, rng)) 252 | 253 | 254 | -------------------------------------------------------------------------------- /deer/learning_algos/AC_net_keras.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for the actor-critic "DDPG" (https://arxiv.org/abs/1509.02971) 3 | 4 | """ 5 | 6 | import numpy as np 7 | from ..base_classes import LearningAlgo as ACNetwork 8 | from .NN_keras import NN # Default Neural network used 9 | from tensorflow.keras.optimizers import SGD,RMSprop 10 | from tensorflow.keras import backend as K 11 | 12 | try: 13 | import tensorflow as tf 14 | assert(K.backend()=="tensorflow") 15 | except: 16 | print('Error : Currently only Tensorflow is supported as a backend for AC_net_keras. You can make the switch in the file ~/.keras/keras.json') 17 | 18 | class MyACNetwork(ACNetwork): 19 | """ 20 | Actor-critic learning (using Keras) with Deep Deterministic Policy Gradient (DDPG) for the continuous action domain 21 | 22 | Parameters 23 | ----------- 24 | environment : object from class Environment 25 | The environment in which the agent evolves. 26 | rho : float 27 | Parameter for rmsprop. Default : 0.9 28 | rms_epsilon : float 29 | Parameter for rmsprop. Default : 0.0001 30 | momentum : float 31 | Momentum for SGD. Default : 0 32 | clip_norm : float 33 | The gradient tensor will be clipped to a maximum L2 norm given by this value. 34 | freeze_interval : int 35 | Period during which the target network is freezed and after which the target network is updated. Default : 1000 36 | batch_size : int 37 | Number of tuples taken into account for each iteration of gradient descent. Default : 32 38 | update_rule: str 39 | {sgd,rmsprop}. Default : rmsprop 40 | random_state : numpy random number generator 41 | Set the random seed. 42 | double_Q : bool, optional 43 | Activate or not the double_Q learning. 44 | More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. 
45 | neural_network_critic : object, optional 46 | default is deer.learning_algos.NN_keras 47 | neural_network_actor : object, optional 48 | default is deer.learning_algos.NN_keras 49 | """ 50 | 51 | def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_norm=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network_critic=NN, neural_network_actor=NN): 52 | """ Initialize environment 53 | 54 | """ 55 | ACNetwork.__init__(self,environment, batch_size) 56 | 57 | self._rho = rho 58 | self._rms_epsilon = rms_epsilon 59 | self._momentum = momentum 60 | self._clip_norm = clip_norm 61 | self._freeze_interval = freeze_interval 62 | self._double_Q = double_Q 63 | self._random_state = random_state 64 | self._nActions=environment.nActions() 65 | self.update_counter = 0 66 | 67 | # self.sess = tf.Session() 68 | # K.set_session(self.sess) 69 | 70 | Q_net = neural_network_critic(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, True) 71 | 72 | self.q_vals, self.params, self.inputsQ = Q_net._buildDQN() 73 | 74 | if (update_rule=="sgd"): 75 | optimizer = SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) 76 | elif (update_rule=="rmsprop"): 77 | optimizer = RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) 78 | else: 79 | raise Exception('The update_rule '+update_rule+ 'is not implemented.') 80 | 81 | self.q_vals.compile(optimizer=optimizer, loss='mse') 82 | 83 | self.next_q_vals, self.next_params, self.next_inputsQ = Q_net._buildDQN() 84 | self.next_q_vals.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.q_vals 85 | 86 | self._resetQHat() 87 | 88 | 89 | policy_net = neural_network_actor(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, False) 90 | self.policy, self.params_policy = policy_net._buildDQN() 91 | self.policy.compile(optimizer=optimizer, loss='mse') 92 | self.next_policy, self.next_params_policy = policy_net._buildDQN() 93 | self.next_policy.compile(optimizer=optimizer, loss='mse') 94 | 95 | 96 | 97 | ### self.policy 98 | #self.action_grads = tf.gradients(self.q_vals.output,self.inputsQ[-1]) #GRADIENTS for policy update 99 | 100 | 101 | #self.sess.run(tf.initialize_all_variables()) 102 | 103 | 104 | def getAllParams(self): 105 | """ Get all parameters used by the learning algorithm 106 | 107 | Returns 108 | ------- 109 | Values of the parameters: list of numpy arrays 110 | """ 111 | params_value=[] 112 | for i,p in enumerate(self.params): 113 | params_value.append(K.get_value(p)) 114 | for i,p in enumerate(self.params_policy): 115 | params_value.append(K.get_value(p)) 116 | 117 | return params_value 118 | 119 | def setAllParams(self, list_of_values): 120 | """ Set all parameters used by the learning algorithm 121 | 122 | Arguments 123 | --------- 124 | list_of_values : list of numpy arrays 125 | list of the parameters to be set (same order than given by getAllParams()). 126 | """ 127 | for i,p in enumerate(self.params): 128 | K.set_value(p,list_of_values[i]) 129 | for j,p in enumerate(self.params_policy): 130 | K.set_value(p,list_of_values[j+i+1]) 131 | 132 | def train(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): 133 | """ 134 | Train the actor-critic algorithm from one batch of data. 
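        More precisely, the critic is first updated towards a one-step Bellman target computed with the target
        actor and target critic networks; the actor is then updated by moving its output in the direction of the
        critic's gradient with respect to the action (see the gradients() method below).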
135 | 136 | Parameters 137 | ----------- 138 | states_val : numpy array of objects 139 | Each object is a numpy array that relates to one of the observations 140 | with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). 141 | actions_val : numpy array of objects with size [self._batch_size]. 142 | Each object is a numpy array of floats with size [len(self._nActions)] 143 | actions[i] is the action taken after having observed states[:][i]. 144 | rewards_val : numpy array of floats with size [self._batch_size] 145 | rewards[i] is the reward obtained for taking actions[i-1]. 146 | next_states_val : numpy array of objects 147 | Each object is a numpy array that relates to one of the observations 148 | with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). 149 | terminals_val : numpy array of booleans with size [self._batch_size] 150 | terminals[i] is True if the transition leads to a terminal state and False otherwise 151 | 152 | 153 | Returns 154 | ------- 155 | Average loss of the batch training 156 | Individual losses for each tuple 157 | """ 158 | if self.update_counter % self._freeze_interval == 0: 159 | self._resetQHat() 160 | 161 | 162 | ### Tain self.q_vals 163 | next_actions_val=self.next_policy.predict(next_states_val.tolist()) 164 | 165 | ns_list=next_states_val.tolist() 166 | ns_list.append( next_actions_val ) 167 | next_q_vals = self.next_q_vals.predict( ns_list ) 168 | 169 | not_terminals=np.invert(terminals_val).astype(float) 170 | 171 | target = rewards_val + not_terminals * self._df * next_q_vals.reshape((-1)) 172 | 173 | s_list=states_val.tolist() 174 | s_list.append( np.array(actions_val.tolist()) ) 175 | 176 | # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff 177 | q_vals=self.q_vals.predict( s_list ).reshape((-1)) 178 | diff_q = - q_vals + target 179 | loss_ind_q=pow(diff_q,2) 180 | 181 | loss_q=self.q_vals.train_on_batch( s_list , target ) 182 | 183 | 184 | ### Train self.policy 185 | cur_action=self.policy.predict(states_val.tolist()) 186 | cur_action=self.clip_action(cur_action) 187 | gg=self.gradients(states_val.tolist(),cur_action) 188 | 189 | target_action=self.clip_action(cur_action+gg) 190 | 191 | # Calculation of the individual losses for the policy network 192 | diff_policy = - cur_action + target_action 193 | loss_ind_policy=np.sum(pow(diff_policy,2),axis=-1) 194 | 195 | loss_policy=self.policy.train_on_batch(states_val.tolist(), target_action) 196 | 197 | self.update_counter += 1 198 | 199 | 200 | return loss_q+loss_policy,loss_ind_q+loss_ind_policy 201 | 202 | 203 | def clip_action(self, action): 204 | """ 205 | Clip the possible actions if it is outside the action space defined by self._nActions 206 | self._nActions is given as [[low_action1,high_action1],[low_action2,high_action2], ...] 
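        For instance, with self._nActions=[[-1,1],[0,2]], an action [1.5,-0.3] is clipped to [1.,0.].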
207 | """ 208 | return np.clip(action,np.array(self._nActions)[:,0],np.array(self._nActions)[:,1]) 209 | 210 | 211 | def gradients(self, states, actions): 212 | """ 213 | Returns the gradients on the Q-network for the different actions (used for policy update) 214 | """ 215 | # combine state features with action 216 | input_list = states.copy() 217 | input_list.append(actions) 218 | 219 | # inputs need to be tf.Variable to calculate gradients 220 | input_list = [tf.Variable(input, dtype=tf.float32) for input in input_list] 221 | 222 | with tf.GradientTape() as tape: 223 | q_vals = self.q_vals(input_list) 224 | 225 | grads = tape.gradient(q_vals, input_list) 226 | 227 | #last entry in grads corresponds to the gradients of the q_vals with respect to the action 228 | out = grads[-1].numpy() 229 | 230 | return out 231 | 232 | def chooseBestAction(self, state, *args, **kwargs): 233 | """ Get the best action for a pseudo-state 234 | 235 | Arguments 236 | --------- 237 | state : one pseudo-state 238 | 239 | Returns 240 | ------- 241 | best_action : float 242 | estim_value : float 243 | """ 244 | 245 | best_action=self.policy.predict([np.expand_dims(s,axis=0) for s in state]) 246 | best_action=self.clip_action(best_action) 247 | 248 | the_list=[np.expand_dims(s,axis=0) for s in state] 249 | the_list.append( best_action ) 250 | estim_value=(self.q_vals.predict(the_list)[0,0]) 251 | 252 | return best_action[0],estim_value 253 | 254 | def _resetQHat(self): 255 | for i,(param,next_param) in enumerate(zip(self.params, self.next_params)): 256 | K.set_value(next_param,K.get_value(param)) 257 | -------------------------------------------------------------------------------- /deer/learning_algos/NN_keras.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural network using Keras (called by q_net_keras) 3 | 4 | """ 5 | 6 | import numpy as np 7 | from tensorflow.keras.models import Model 8 | from tensorflow.keras.layers import Input, Layer, Dense, Flatten, concatenate, Activation, Conv2D, MaxPooling2D, Reshape, Permute 9 | 10 | class NN(): 11 | """ 12 | Deep Q-learning network using Keras 13 | 14 | Parameters 15 | ----------- 16 | batch_size : int 17 | Number of tuples taken into account for each iteration of gradient descent 18 | input_dimensions : 19 | n_actions : 20 | random_state : numpy random number generator 21 | Set the random seed. 
22 | action_as_input : Boolean 23 | Whether the action is given as input or as output 24 | """ 25 | def __init__(self, batch_size, input_dimensions, n_actions, random_state, action_as_input=False): 26 | self._input_dimensions=input_dimensions 27 | self._batch_size=batch_size 28 | self._random_state=random_state 29 | self._n_actions=n_actions 30 | self._action_as_input=action_as_input 31 | 32 | def _buildDQN(self): 33 | """ 34 | Build a network consistent with each type of inputs 35 | """ 36 | layers=[] 37 | outs_conv=[] 38 | inputs=[] 39 | 40 | for i, dim in enumerate(self._input_dimensions): 41 | # - observation[i] is a FRAME 42 | if len(dim) == 3 or len(dim) == 4: 43 | if(len(dim) == 4): 44 | input = Input(shape=(dim[-4],dim[-3],dim[-2],dim[-1])) 45 | inputs.append(input) 46 | input = Reshape((dim[-4]*dim[-3],dim[-2],dim[-1]), input_shape=(dim[-4],dim[-3],dim[-2],dim[-1]))(input) 47 | x=Permute((2,3,1), input_shape=(dim[-4]*dim[-3],dim[-2],dim[-1]))(input) #data_format='channels_last' 48 | else: 49 | input = Input(shape=(dim[-3],dim[-2],dim[-1])) 50 | inputs.append(input) 51 | x=Permute((2,3,1), input_shape=(dim[-3],dim[-2],dim[-1]))(input) #data_format='channels_last' 52 | x = Conv2D(8, (4, 4), activation='relu', padding='valid')(x) #Conv on the frames 53 | x = Conv2D(16, (3, 3), activation='relu', padding='valid')(x) #Conv on the frames 54 | x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='valid')(x) 55 | x = Conv2D(16, (3, 3), activation='relu', padding='valid')(x) #Conv on the frames 56 | 57 | out = Flatten()(x) 58 | 59 | # - observation[i] is a VECTOR 60 | elif len(dim) == 2: 61 | if dim[0] > 3: 62 | input = Input(shape=(dim[0],dim[1])) 63 | inputs.append(input) 64 | reshaped=Reshape((dim[0],dim[1],1), input_shape=(dim[0],dim[1]))(input) 65 | x = Conv2D(16, (2, 1), activation='relu', padding='valid')(reshaped)#Conv on the history 66 | x = Conv2D(16, (2, 1), activation='relu', padding='valid')(x) #Conv on the history & features 67 | 68 | out = Flatten()(x) 69 | else: 70 | input = Input(shape=(dim[0],dim[1])) 71 | inputs.append(input) 72 | out = Flatten()(input) 73 | 74 | # - observation[i] is a SCALAR - 75 | else: 76 | if dim[0] > 3: 77 | # this returns a tensor 78 | input = Input(shape=(dim[0],)) 79 | inputs.append(input) 80 | reshaped=Reshape((1,dim[0],1), input_shape=(dim[0],))(input) 81 | x = Conv2D(8, (1,2), activation='relu', padding='valid')(reshaped) #Conv on the history 82 | x = Conv2D(8, (1,2), activation='relu', padding='valid')(x) #Conv on the history 83 | 84 | out = Flatten()(x) 85 | 86 | else: 87 | input = Input(shape=(dim[0],)) 88 | inputs.append(input) 89 | out=input 90 | 91 | outs_conv.append(out) 92 | 93 | if (self._action_as_input==True): 94 | if ( isinstance(self._n_actions,int)): 95 | print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") 96 | else: 97 | input = Input(shape=(len(self._n_actions),)) 98 | inputs.append(input) 99 | outs_conv.append(input) 100 | 101 | if len(outs_conv)>1: 102 | x = concatenate(outs_conv) 103 | else: 104 | x= outs_conv [0] 105 | 106 | # we stack a deep fully-connected network on top 107 | x = Dense(50, activation='relu')(x) 108 | x = Dense(20, activation='relu')(x) 109 | 110 | if (self._action_as_input==False): 111 | if ( isinstance(self._n_actions,int)): 112 | out = Dense(self._n_actions)(x) 113 | else: 114 | out = Dense(len(self._n_actions))(x) 115 | else: 116 | out = Dense(1)(x) 117 | 118 | model = Model(inputs=inputs, outputs=out) 119 | layers=model.layers 120 | 121 | # Grab all the 
parameters together. 122 | params = [ param 123 | for layer in layers 124 | for param in layer.trainable_weights ] 125 | 126 | if (self._action_as_input==True): 127 | return model, params, inputs 128 | else: 129 | return model, params 130 | 131 | if __name__ == '__main__': 132 | pass 133 | 134 | -------------------------------------------------------------------------------- /deer/learning_algos/NN_keras_LSTM.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural network with LSTM's using Keras (called by q_net_keras) 3 | 4 | """ 5 | 6 | import numpy as np 7 | from tensorflow.keras.models import Model 8 | from tensorflow.keras.layers import Input, Layer, Dense, Flatten, concatenate, Activation, Convolution2D, MaxPooling2D, Reshape 9 | from tensorflow.keras.layers.recurrent import LSTM 10 | 11 | class NN(): 12 | """ 13 | Deep Q-learning network with LSTM's using Keras 14 | 15 | Parameters 16 | ----------- 17 | batch_size : int 18 | Number of tuples taken into account for each iteration of gradient descent 19 | input_dimensions : tuples 20 | n_actions : int 21 | random_state : numpy random number generator 22 | Set the random seed. 23 | action_as_input : Boolean 24 | Whether the action is given as input or as output 25 | """ 26 | def __init__(self, batch_size, input_dimensions, n_actions, random_state, action_as_input=False): 27 | self._input_dimensions=input_dimensions 28 | self._batch_size=batch_size 29 | self._random_state=random_state 30 | self._n_actions=n_actions 31 | self._action_as_input=action_as_input 32 | 33 | def _buildDQN(self): 34 | """ 35 | Build a network consistent with each type of inputs 36 | """ 37 | layers=[] 38 | outs_conv=[] 39 | inputs=[] 40 | 41 | for i, dim in enumerate(self._input_dimensions): 42 | # - observation[i] is a FRAME 43 | if len(dim) == 3: 44 | input = Input(shape=(dim[0],dim[1],dim[2])) 45 | inputs.append(input) 46 | x = Convolution2D(32, 8, 8, border_mode='valid')(input) 47 | x = MaxPooling2D(pool_size=(4, 4), strides=None, border_mode='valid')(x) 48 | x = Convolution2D(64, 4, 4, border_mode='valid')(x) 49 | x = MaxPooling2D(pool_size=(2, 2), strides=None, border_mode='valid')(x) 50 | x = Convolution2D(64, 3, 3)(x) 51 | 52 | # We may add here LSTM's after having flatten the last two dimensions 53 | 54 | x = Flatten()(x) 55 | 56 | # - observation[i] is a VECTOR 57 | if len(dim) == 2: 58 | input = Input(shape=(dim[0],dim[1])) 59 | inputs.append(input) 60 | 61 | if dim[0] > 3: 62 | 63 | x = LSTM(16, 64 | activation='relu', 65 | return_sequences=True)(input) 66 | x = LSTM(16, 67 | activation='relu', 68 | return_sequences=False)(x) # Structure many-to-one 69 | 70 | else: 71 | x=input 72 | x = Flatten()(x) 73 | 74 | # - observation[i] is a SCALAR - 75 | elif(len(dim) == 1): 76 | 77 | input = Input(shape=(dim[0],)) 78 | inputs.append(input) 79 | input = Reshape((dim[0],1))(input) 80 | 81 | if dim[0] > 3: 82 | x = LSTM(8, 83 | activation='relu', 84 | return_sequences=True)(input) 85 | x = LSTM(8, 86 | activation='relu', 87 | return_sequences=False)(x) # Structure many-to-one 88 | else: 89 | x=input 90 | x = Flatten()(x) 91 | 92 | outs_conv.append(x) 93 | 94 | if (self._action_as_input==True): 95 | if ( isinstance(self._n_actions,int)): 96 | print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") 97 | else: 98 | input = Input(shape=(len(self._n_actions),)) 99 | inputs.append(input) 100 | outs_conv.append(input) 101 | 102 | if len(outs_conv)>1: 103 | x = 
concatenate(outs_conv) 104 | else: 105 | x= outs_conv [0] 106 | 107 | x = Dense(50, activation='relu')(x) 108 | x = Dense(20, activation='relu')(x) 109 | 110 | if (self._action_as_input==False): 111 | if ( isinstance(self._n_actions,int)): 112 | out = Dense(self._n_actions)(x) 113 | else: 114 | out = Dense(len(self._n_actions))(x) 115 | else: 116 | out = Dense(1)(x) 117 | 118 | model = Model(inputs=inputs, outputs=out) 119 | layers=model.layers 120 | 121 | # Grab all the parameters together. 122 | params = [ param 123 | for layer in layers 124 | for param in layer.trainable_weights ] 125 | 126 | if (self._action_as_input==True): 127 | return model, params, inputs 128 | else: 129 | return model, params 130 | 131 | if __name__ == '__main__': 132 | pass 133 | 134 | -------------------------------------------------------------------------------- /deer/learning_algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/deer/learning_algos/__init__.py -------------------------------------------------------------------------------- /deer/learning_algos/q_net_keras.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for general deep Q-learning using Keras that can take as inputs scalars, vectors and matrices 3 | 4 | .. Author: Vincent Francois-Lavet 5 | """ 6 | 7 | import numpy as np 8 | from tensorflow.keras.optimizers import SGD,RMSprop 9 | from tensorflow.keras import backend as K 10 | from ..base_classes import LearningAlgo as QNetwork 11 | from .NN_keras import NN # Default Neural network used 12 | import gc 13 | 14 | class MyQNetwork(QNetwork): 15 | """ 16 | Deep Q-learning network using Keras (with any backend) 17 | 18 | Parameters 19 | ----------- 20 | environment : object from class Environment 21 | The environment in which the agent evolves. 22 | rho : float 23 | Parameter for rmsprop. Default : 0.9 24 | rms_epsilon : float 25 | Parameter for rmsprop. Default : 0.0001 26 | momentum : float 27 | Momentum for SGD. Default : 0 28 | clip_norm : float 29 | The gradient tensor will be clipped to a maximum L2 norm given by this value. 30 | freeze_interval : int 31 | Period during which the target network is freezed and after which the target network is updated. Default : 1000 32 | batch_size : int 33 | Number of tuples taken into account for each iteration of gradient descent. Default : 32 34 | update_rule: str 35 | {sgd,rmsprop}. Default : rmsprop 36 | random_state : numpy random number generator 37 | double_Q : bool, optional 38 | Activate or not the double_Q learning. 39 | More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. 
40 | neural_network : object, optional 41 | default is deer.learning_algos.NN_keras 42 | """ 43 | 44 | def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_norm=1, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN): 45 | """ Initialize environment 46 | 47 | """ 48 | QNetwork.__init__(self,environment, batch_size) 49 | 50 | 51 | self._rho = rho 52 | self._rms_epsilon = rms_epsilon 53 | self._momentum = momentum 54 | self._clip_norm = clip_norm 55 | self._update_rule = update_rule 56 | self._freeze_interval = freeze_interval 57 | self._double_Q = double_Q 58 | self._random_state = random_state 59 | self.update_counter = 0 60 | 61 | Q_net = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) 62 | self.q_vals, self.params = Q_net._buildDQN() 63 | 64 | self._compile() 65 | 66 | self.next_q_vals, self.next_params = Q_net._buildDQN() 67 | self.next_q_vals.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.q_vals 68 | 69 | self._resetQHat() 70 | 71 | def getAllParams(self): 72 | """ Get all parameters used by the learning algorithm 73 | 74 | Returns 75 | ------- 76 | Values of the parameters: list of numpy arrays 77 | """ 78 | params_value=[] 79 | for i,p in enumerate(self.params): 80 | params_value.append(K.get_value(p)) 81 | return params_value 82 | 83 | def setAllParams(self, list_of_values): 84 | """ Set all parameters used by the learning algorithm 85 | 86 | Arguments 87 | --------- 88 | list_of_values : list of numpy arrays 89 | list of the parameters to be set (same order than given by getAllParams()). 90 | """ 91 | for i,p in enumerate(self.params): 92 | K.set_value(p,list_of_values[i]) 93 | 94 | def train(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): 95 | """ 96 | Train the Q-network from one batch of data. 97 | 98 | Parameters 99 | ----------- 100 | states_val : numpy array of objects 101 | Each object is a numpy array that relates to one of the observations 102 | with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]. 103 | actions_val : numpy array of integers with size [self._batch_size] 104 | actions[i] is the action taken after having observed states[:][i]. 105 | rewards_val : numpy array of floats with size [self._batch_size] 106 | rewards[i] is the reward obtained for taking actions[i-1]. 107 | next_states_val : numpy array of objects 108 | Each object is a numpy array that relates to one of the observations 109 | with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]. 
110 | terminals_val : numpy array of booleans with size [self._batch_size] 111 | terminals[i] is True if the transition leads to a terminal state and False otherwise 112 | 113 | Returns 114 | ------- 115 | Average loss of the batch training (RMSE) 116 | Individual (square) losses for each tuple 117 | """ 118 | 119 | if self.update_counter % self._freeze_interval == 0: 120 | self._resetQHat() 121 | 122 | next_q_vals = self.next_q_vals.predict(next_states_val.tolist(), verbose=0) 123 | 124 | if(self._double_Q==True): 125 | next_q_vals_current_qnet=self.q_vals.predict(next_states_val.tolist(), verbose=0) 126 | argmax_next_q_vals=np.argmax(next_q_vals_current_qnet, axis=1) 127 | max_next_q_vals=next_q_vals[np.arange(self._batch_size),argmax_next_q_vals].reshape((-1, 1)) 128 | else: 129 | max_next_q_vals=np.max(next_q_vals, axis=1, keepdims=True) 130 | 131 | not_terminals=np.invert(terminals_val).astype(float) 132 | 133 | target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) 134 | 135 | q_vals=self.q_vals.predict(states_val.tolist(), verbose=0) 136 | 137 | # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff 138 | q_val=q_vals[np.arange(self._batch_size), actions_val] 139 | diff = - q_val + target 140 | loss_ind=pow(diff,2) 141 | 142 | q_vals[ np.arange(self._batch_size), actions_val ] = target 143 | 144 | # Is it possible to use something more flexible than this? 145 | # Only some elements of next_q_vals are actual value that I target. 146 | # My loss should only take these into account. 147 | # Workaround here is that many values are already "exact" in this update 148 | loss=self.q_vals.train_on_batch(states_val.tolist() , q_vals ) 149 | 150 | self.update_counter += 1 151 | 152 | gc.collect() #Clearing potential unused memory to avoid any memory leak 153 | 154 | # loss*self._n_actions = np.average(loss_ind) 155 | return np.sqrt(loss),loss_ind 156 | 157 | 158 | def qValues(self, state_val): 159 | """ Get the q values for one belief state 160 | 161 | Arguments 162 | --------- 163 | state_val : one belief state 164 | 165 | Returns 166 | ------- 167 | The q values for the provided belief state 168 | """ 169 | q_vals_pred=self.q_vals.predict([np.expand_dims(state,axis=0) for state in state_val], verbose=0)[0] 170 | 171 | return q_vals_pred 172 | 173 | def chooseBestAction(self, state, *args, **kwargs): 174 | """ Get the best action for a pseudo-state 175 | 176 | Arguments 177 | --------- 178 | state : one pseudo-state 179 | 180 | Returns 181 | ------- 182 | The best action : int 183 | """ 184 | q_vals = self.qValues(state) 185 | 186 | action_to_take=np.argmax(q_vals) 187 | corresponding_q_val=np.max(q_vals) 188 | gc.collect() #Clearing potential unused memory to avoid any memory leak 189 | 190 | return action_to_take,corresponding_q_val 191 | 192 | def _compile(self): 193 | """ Compile self.q_vals 194 | """ 195 | 196 | if (self._update_rule=="sgd"): 197 | optimizer = SGD(learning_rate=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) 198 | elif (self._update_rule=="rmsprop"): 199 | optimizer = RMSprop(learning_rate=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) 200 | else: 201 | raise Exception('The update_rule '+self._update_rule+' is not implemented.') 202 | 203 | self.q_vals.compile(optimizer=optimizer, loss='mse') 204 | 205 | 206 | def _resetQHat(self): 207 | """ Set the target Q-network weights equal to the main Q-network weights 208 | """ 209 | 210 | for 
i,(param,next_param) in enumerate(zip(self.params, self.next_params)): 211 | K.set_value(next_param,K.get_value(param)) 212 | 213 | self._compile() # recompile to take into account new optimizer parameters that may have changed since 214 | # self._compile() was called in __init__. FIXME: this call should ideally be done elsewhere 215 | # Not ideal to recompile everytime we change e.g. only the lr 216 | 217 | -------------------------------------------------------------------------------- /deer/policies/EpsilonGreedyPolicy.py: -------------------------------------------------------------------------------- 1 | from ..base_classes import Policy 2 | 3 | 4 | class EpsilonGreedyPolicy(Policy): 5 | """The policy acts greedily with probability :math:`1-\epsilon` and acts randomly otherwise. 6 | It is now used as a default policy for the neural agent. 7 | 8 | Parameters 9 | ----------- 10 | epsilon : float 11 | Proportion of random steps 12 | """ 13 | def __init__(self, learning_algo, n_actions, random_state, epsilon): 14 | Policy.__init__(self, learning_algo, n_actions, random_state) 15 | self._epsilon = epsilon 16 | 17 | def action(self, state, mode=None, *args, **kwargs): 18 | if self.random_state.rand() < self._epsilon: 19 | action, V = self.randomAction() 20 | else: 21 | action, V = self.bestAction(state, mode, *args, **kwargs) 22 | 23 | return action, V 24 | 25 | def setEpsilon(self, e): 26 | """ Set the epsilon used for :math:`\epsilon`-greedy exploration 27 | """ 28 | self._epsilon = e 29 | 30 | def epsilon(self): 31 | """ Get the epsilon for :math:`\epsilon`-greedy exploration 32 | """ 33 | return self._epsilon 34 | -------------------------------------------------------------------------------- /deer/policies/LongerExplorationPolicy.py: -------------------------------------------------------------------------------- 1 | """ Exploration policy for permutation invariant environments 2 | 3 | """ 4 | 5 | from ..base_classes import Policy 6 | import itertools 7 | import random 8 | import copy 9 | import numpy as np 10 | 11 | class LongerExplorationPolicy(Policy): 12 | """Simple alternative to :math:`\epsilon`-greedy that can explore more 13 | efficiently for a broad class of realistic problems. 
14 | 15 | Parameters 16 | ----------- 17 | epsilon : float 18 | Proportion of random steps 19 | length : int 20 | Length of the exploration sequences that will be considered 21 | """ 22 | def __init__(self, learning_algo, n_actions, random_state, epsilon, length=10): 23 | Policy.__init__(self, learning_algo, n_actions, random_state) 24 | self._epsilon = epsilon 25 | self._l = length 26 | self._count_down = -1 27 | self._action_sequence = [] 28 | 29 | def action(self, state, mode=None, *args, **kwargs): 30 | if self._count_down >= 0: 31 | # Take the next exploration action in the sequence 32 | V = 0 33 | action = self._action_sequence[self._count_down] 34 | self._count_down -= 1 35 | else: 36 | if self.random_state.rand() < self._epsilon/((1+(self._l-1)*(1-self._epsilon))): 37 | # Take a random action and build an exploration sequence for the next steps 38 | self._count_down = self._l - 1 39 | self._action_sequence = self.sampleUniformActionSequence() 40 | action = self._action_sequence[self._count_down] 41 | V = 0 42 | self._count_down -= 1 43 | else: 44 | # Simply act greedily with respect to what is currently believed to be the best action 45 | action, V = self.bestAction(state, mode, args, kwargs) 46 | 47 | return np.array(action), V 48 | 49 | def setEpsilon(self, e): 50 | """ Set the epsilon 51 | """ 52 | self._epsilon = e 53 | 54 | def epsilon(self): 55 | """ Get the epsilon 56 | """ 57 | return self._epsilon 58 | 59 | def sampleUniformActionSequence(self): 60 | if ( isinstance(self.n_actions,int)): 61 | """ Sample an action sequence of length self._l, where the unordered sequences have uniform probabilities""" 62 | actions_list = range(self.n_actions) 63 | else: 64 | """For N exploration steps, the goal is to have actions such that their sum spans quite uniformly 65 | the whole range of possibilities. Among those possibilities, random choice/order of actions. 
""" 66 | 67 | possible_actions=[] 68 | # Add for all actions N random element between min and max 69 | N=3 70 | for i,a in enumerate(self.n_actions): 71 | possible_actions.append([]) 72 | for j in range(N): 73 | possible_actions[i].append( self.random_state.uniform(self.n_actions[i][0],self.n_actions[i][1]) ) 74 | actions_list = list(itertools.product(*possible_actions)) 75 | 76 | sequences_with_replacement = list(itertools.combinations_with_replacement(actions_list, self._l)) 77 | index_pick = self.random_state.randint(0, len(sequences_with_replacement)) 78 | sequence = list(sequences_with_replacement[index_pick]) 79 | self.random_state.shuffle(sequence) 80 | 81 | return sequence 82 | -------------------------------------------------------------------------------- /deer/policies/__init__.py: -------------------------------------------------------------------------------- 1 | from .EpsilonGreedyPolicy import EpsilonGreedyPolicy 2 | from .LongerExplorationPolicy import LongerExplorationPolicy -------------------------------------------------------------------------------- /deer/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/deer/tests/__init__.py -------------------------------------------------------------------------------- /deer/tests/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | # Example of tests 4 | class TestStringMethods(unittest.TestCase): 5 | 6 | def test_upper(self): 7 | self.assertEqual('foo'.upper(), 'FOO') 8 | 9 | def test_isupper(self): 10 | self.assertTrue('FOO'.isupper()) 11 | self.assertFalse('Foo'.isupper()) 12 | 13 | def test_split(self): 14 | s = 'hello world' 15 | self.assertEqual(s.split(), ['hello', 'world']) 16 | # check that s.split fails when the separator is not a string 17 | with self.assertRaises(TypeError): 18 | s.split(2) 19 | 20 | if __name__ == '__main__': 21 | unittest.main() -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/deer.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/deer.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/deer" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/deer" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
178 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | try: 3 | from unittest.mock import Mock 4 | except ImportError: 5 | from mock import Mock 6 | 7 | #class Mock(MagicMock): 8 | # @classmethod 9 | # def __getattr__(cls, name): 10 | # return Mock() 11 | # 12 | #MOCK_MODULES = ['numpy', 'scipy', 'matplotlib', 'matplotlib.pyplot', 'scipy.interpolate', 'theano', 'theano.tensor', 'joblib', 'lasagne'] 13 | #sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 14 | 15 | sys.modules['pylearn2'] = Mock() 16 | sys.modules['pylearn2.sandbox'] = Mock() 17 | sys.modules['pylearn2.sandbox.cuda_convnet'] = Mock() 18 | sys.modules['pylearn2.sandbox.cuda_convnet.filter_acts'] = \ 19 | Mock(FilterActs=None) 20 | sys.modules['scipy'] = Mock() 21 | sys.modules['theano'] = Mock() 22 | sys.modules['theano.tensor'] = Mock() 23 | sys.modules['theano.tensor.signal'] = Mock() 24 | sys.modules['theano.tensor.nnet'] = Mock() 25 | sys.modules['joblib'] = Mock() 26 | sys.modules['lasagne'] = Mock() 27 | sys.modules['lasagne.updates'] = Mock() 28 | sys.modules['keras.models'] = Mock() 29 | sys.modules['keras.layers'] = Mock() 30 | sys.modules['keras.optimizers'] = Mock() 31 | sys.modules['keras.backend'] = Mock() 32 | sys.modules['keras'] = Mock() 33 | 34 | 35 | # -*- coding: utf-8 -*- 36 | # 37 | # deer documentation build configuration file, created by 38 | # sphinx-quickstart on Wed Apr 6 16:38:40 2016. 39 | # 40 | # This file is execfile()d with the current directory set to its 41 | # containing dir. 42 | # 43 | # Note that not all possible configuration values are present in this 44 | # autogenerated file. 45 | # 46 | # All configuration values have a default; values that are commented out 47 | # serve to show the default. 48 | 49 | import sys 50 | import os 51 | 52 | # If extensions (or modules to document with autodoc) are in another directory, 53 | # add these directories to sys.path here. If the directory is relative to the 54 | # documentation root, use os.path.abspath to make it absolute, like shown here. 55 | sys.path.insert(0, os.path.abspath('.')) 56 | 57 | # -- General configuration ------------------------------------------------ 58 | 59 | # If your documentation needs a minimal Sphinx version, state it here. 60 | #needs_sphinx = '1.0' 61 | 62 | # Add any Sphinx extension module names here, as strings. They can be 63 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 64 | # ones. 65 | extensions = [ 66 | 'sphinx.ext.autodoc', 67 | 'sphinx.ext.autosummary', 68 | 'sphinx.ext.doctest', 69 | 'sphinx.ext.mathjax', 70 | # 'sphinx.ext.viewcode', # create HTML file of source code and link to it 71 | # 'sphinx.ext.linkcode', # link to github, see linkcode_resolve() below 72 | ## 'numpydoc', # !Generates unwanted tables with autoclass! 73 | # 'sphinx.ext.napoleon', # alternative to numpydoc -- looks a bit worse. 74 | ] 75 | 76 | 77 | # Add any paths that contain templates here, relative to this directory. 78 | templates_path = ['_templates'] 79 | 80 | # The suffix of source filenames. 81 | source_suffix = '.rst' 82 | 83 | # The encoding of source files. 84 | #source_encoding = 'utf-8-sig' 85 | 86 | # The master toctree document. 87 | master_doc = 'index' 88 | 89 | # General information about the project. 
90 | project = u'deer' 91 | copyright = u'2016, deer contributors' 92 | 93 | # The version info for the project you're documenting, acts as replacement for 94 | # |version| and |release|, also used in various other places throughout the 95 | # built documents. 96 | # 97 | # The short X.Y version. 98 | version = '0.4.3' 99 | # The full version, including alpha/beta/rc tags. 100 | release = '0.4.3' 101 | 102 | # The language for content autogenerated by Sphinx. Refer to documentation 103 | # for a list of supported languages. 104 | #language = None 105 | 106 | # There are two options for replacing |today|: either, you set today to some 107 | # non-false value, then it is used: 108 | #today = '' 109 | # Else, today_fmt is used as the format for a strftime call. 110 | #today_fmt = '%B %d, %Y' 111 | 112 | # List of patterns, relative to source directory, that match files and 113 | # directories to ignore when looking for source files. 114 | exclude_patterns = ['_build'] 115 | 116 | # The reST default role (used for this markup: `text`) to use for all 117 | # documents. 118 | #default_role = None 119 | 120 | # If true, '()' will be appended to :func: etc. cross-reference text. 121 | #add_function_parentheses = True 122 | 123 | # If true, the current module name will be prepended to all description 124 | # unit titles (such as .. function::). 125 | #add_module_names = True 126 | 127 | # If true, sectionauthor and moduleauthor directives will be shown in the 128 | # output. They are ignored by default. 129 | #show_authors = False 130 | 131 | # The name of the Pygments (syntax highlighting) style to use. 132 | pygments_style = 'sphinx' 133 | 134 | # A list of ignored prefixes for module index sorting. 135 | #modindex_common_prefix = [] 136 | 137 | # If true, keep warnings as "system message" paragraphs in the built documents. 138 | #keep_warnings = False 139 | 140 | 141 | # -- Options for HTML output ---------------------------------------------- 142 | 143 | # The theme to use for HTML and HTML Help pages. See the documentation for 144 | # a list of builtin themes. 145 | #html_theme = 'default' 146 | ### Read the docs style: 147 | if os.environ.get('READTHEDOCS') != 'True': 148 | try: 149 | import sphinx_rtd_theme 150 | except ImportError: 151 | pass # assume we have sphinx >= 1.3 152 | else: 153 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 154 | html_theme = 'sphinx_rtd_theme' 155 | def setup(app): 156 | app.add_stylesheet("fix_rtd.css") 157 | 158 | 159 | # Theme options are theme-specific and customize the look and feel of a theme 160 | # further. For a list of options available for each theme, see the 161 | # documentation. 162 | #html_theme_options = {} 163 | 164 | # Add any paths that contain custom themes here, relative to this directory. 165 | #html_theme_path = [] 166 | 167 | # The name for this set of Sphinx documents. If None, it defaults to 168 | # " v documentation". 169 | #html_title = None 170 | 171 | # A shorter title for the navigation bar. Default is the same as html_title. 172 | #html_short_title = None 173 | 174 | # The name of an image file (relative to this directory) to place at the top 175 | # of the sidebar. 176 | #html_logo = None 177 | 178 | # The name of an image file (within the static path) to use as favicon of the 179 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 180 | # pixels large. 
181 | #html_favicon = None 182 | 183 | # Add any paths that contain custom static files (such as style sheets) here, 184 | # relative to this directory. They are copied after the builtin static files, 185 | # so a file named "default.css" will overwrite the builtin "default.css". 186 | html_static_path = ['_static'] 187 | 188 | # Add any extra paths that contain custom files (such as robots.txt or 189 | # .htaccess) here, relative to this directory. These files are copied 190 | # directly to the root of the documentation. 191 | #html_extra_path = [] 192 | 193 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 194 | # using the given strftime format. 195 | #html_last_updated_fmt = '%b %d, %Y' 196 | 197 | # If true, SmartyPants will be used to convert quotes and dashes to 198 | # typographically correct entities. 199 | #html_use_smartypants = True 200 | 201 | # Custom sidebar templates, maps document names to template names. 202 | #html_sidebars = {} 203 | 204 | # Additional templates that should be rendered to pages, maps page names to 205 | # template names. 206 | #html_additional_pages = {} 207 | 208 | # If false, no module index is generated. 209 | #html_domain_indices = True 210 | 211 | # If false, no index is generated. 212 | #html_use_index = True 213 | 214 | # If true, the index is split into individual pages for each letter. 215 | #html_split_index = False 216 | 217 | # If true, links to the reST sources are added to the pages. 218 | #html_show_sourcelink = True 219 | 220 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 221 | #html_show_sphinx = True 222 | 223 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 224 | #html_show_copyright = True 225 | 226 | # If true, an OpenSearch description file will be output, and all pages will 227 | # contain a tag referring to it. The value of this option must be the 228 | # base URL from which the finished HTML is served. 229 | #html_use_opensearch = '' 230 | 231 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 232 | #html_file_suffix = None 233 | 234 | # Output file base name for HTML help builder. 235 | htmlhelp_basename = 'deerdoc' 236 | 237 | 238 | # -- Options for LaTeX output --------------------------------------------- 239 | 240 | latex_elements = { 241 | # The paper size ('letterpaper' or 'a4paper'). 242 | #'papersize': 'letterpaper', 243 | 244 | # The font size ('10pt', '11pt' or '12pt'). 245 | #'pointsize': '10pt', 246 | 247 | # Additional stuff for the LaTeX preamble. 248 | #'preamble': '', 249 | } 250 | 251 | # Grouping the document tree into LaTeX files. List of tuples 252 | # (source start file, target name, title, 253 | # author, documentclass [howto, manual, or own class]). 254 | latex_documents = [ 255 | ('index', 'deer.tex', u'deer Documentation', 256 | u'deer contributors', 'manual'), 257 | ] 258 | 259 | # The name of an image file (relative to this directory) to place at the top of 260 | # the title page. 261 | #latex_logo = None 262 | 263 | # For "manual" documents, if this is true, then toplevel headings are parts, 264 | # not chapters. 265 | #latex_use_parts = False 266 | 267 | # If true, show page references after internal links. 268 | #latex_show_pagerefs = False 269 | 270 | # If true, show URL addresses after external links. 271 | #latex_show_urls = False 272 | 273 | # Documents to append as an appendix to all manuals. 274 | #latex_appendices = [] 275 | 276 | # If false, no module index is generated. 
277 | #latex_domain_indices = True 278 | 279 | 280 | # -- Options for manual page output --------------------------------------- 281 | 282 | # One entry per manual page. List of tuples 283 | # (source start file, name, description, authors, manual section). 284 | man_pages = [ 285 | ('index', 'deer', u'deer Documentation', 286 | [u'Vincent François-Lavet'], 1) 287 | ] 288 | 289 | # If true, show URL addresses after external links. 290 | #man_show_urls = False 291 | 292 | 293 | # -- Options for Texinfo output ------------------------------------------- 294 | 295 | # Grouping the document tree into Texinfo files. List of tuples 296 | # (source start file, target name, title, author, 297 | # dir menu entry, description, category) 298 | texinfo_documents = [ 299 | ('index', 'deer', u'deer Documentation', 300 | u'deer contributors', 'deer', 'One line description of project.', 301 | 'Miscellaneous'), 302 | ] 303 | 304 | # Documents to append as an appendix to all manuals. 305 | #texinfo_appendices = [] 306 | 307 | # If false, no module index is generated. 308 | #texinfo_domain_indices = True 309 | 310 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 311 | #texinfo_show_urls = 'footnote' 312 | 313 | # If true, do not generate a @detailmenu in the "Top" node's menu. 314 | #texinfo_no_detailmenu = False 315 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to DeeR's documentation! 2 | ================================== 3 | 4 | DeeR (Deep Reinforcement) is a python library to train an agent how to behave in a given environment so as to maximize a cumulative sum of rewards (see :ref:`what-is-deer`). 5 | 6 | Here are key advantages of the library: 7 | 8 | * You have access within a single library to techniques such as Double Q-learning, prioritized Experience Replay, Deep deterministic policy gradient (DDPG), Combined Reinforcement via Abstract Representations (CRAR), etc. 9 | * This package provides a general framework where observations are made up of any number of elements (scalars, vectors or frames). 10 | * You can easily add up a validation phase that allows to stop the training process before overfitting. This possibility is useful when the environment is dependent on scarce data (e.g. limited time series). 11 | 12 | In addition, the framework is made in such a way that it is easy to 13 | 14 | * build any environment 15 | * modify any part of the learning process 16 | * use your favorite python-based framework to code your own learning algorithm or neural network architecture. The provided learning algorithms and neural network architectures are based on Keras. 17 | 18 | .. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/schema_deer.png 19 | :scale: 50 % 20 | :alt: alternate text 21 | :align: right 22 | 23 | :Figure: General schema of the different elements available in DeeR. 24 | 25 | It is a work in progress and input is welcome. Please submit any contribution via pull request. 26 | 27 | What is new 28 | ------------ 29 | Version 0.4 30 | ************ 31 | - Integration of CRAR that allows to combine the model-free and the model-based approaches via abstract representations. 32 | - Augmented documentation and some interfaces have been updated. 33 | 34 | Version 0.3 35 | ************ 36 | - Integration of different exploration/exploitation policies and possibility to easily built your own. 
37 | - Integration of DDPG for continuous action spaces (see :ref:`actor-critic`) 38 | - :ref:`naming_conv` and some interfaces have been updated. This may cause broken backward compatibility. In that case, make the changes to the new convention by looking at the API in this documentation or by looking at the current version of the examples. 39 | - Additional automated tests 40 | 41 | Version 0.2 42 | *********** 43 | - Standalone python package (you can simply do ``pip install deer``) 44 | - Integration of new examples environments : :ref:`toy_env_pendulum`, :ref:`PLE` and :ref:`gym` 45 | - Double Q-learning and prioritized Experience Replay 46 | - Augmented documentation 47 | - First automated tests 48 | 49 | Future extensions: 50 | ****************** 51 | 52 | * Several agents interacting in the same environment 53 | * ... 54 | 55 | How should I cite DeeR? 56 | ************************ 57 | 58 | Please cite DeeR in your publications if you use it in your research. Here is an example BibTeX entry: 59 | 60 | .. code-block:: bash 61 | 62 | @misc{franccoislavet2016deer, 63 | title={DeeR}, 64 | author={Fran\c{c}ois-Lavet, Vincent and others}, 65 | year={2016}, 66 | howpublished={\url{https://deer.readthedocs.io/}}, 67 | } 68 | 69 | 70 | User Guide 71 | ------------ 72 | 73 | .. toctree:: 74 | :maxdepth: 2 75 | 76 | user/installation 77 | user/tutorial 78 | user/environments 79 | user/development 80 | 81 | API reference 82 | ------------- 83 | 84 | If you are looking for information on a specific function, class or method, this API is for you. 85 | 86 | .. toctree:: 87 | :maxdepth: 2 88 | 89 | modules/agents 90 | modules/controllers 91 | modules/environments 92 | modules/learning-algorithms 93 | modules/policies 94 | 95 | Indices and tables 96 | ------------------ 97 | 98 | * :ref:`genindex` 99 | * :ref:`modindex` 100 | * :ref:`search` 101 | 102 | .. _GitHub: https://github.com/VinF/Deer 103 | -------------------------------------------------------------------------------- /docs/modules/agents.rst: -------------------------------------------------------------------------------- 1 | .. _`agents`: 2 | 3 | :mod:`Agent` 4 | ============= 5 | 6 | .. automodule:: deer.agent 7 | 8 | .. autosummary:: 9 | 10 | NeuralAgent 11 | DataSet 12 | 13 | .. autoclass:: NeuralAgent 14 | :members: 15 | .. autoclass:: DataSet 16 | :members: 17 | -------------------------------------------------------------------------------- /docs/modules/controllers.rst: -------------------------------------------------------------------------------- 1 | .. _`controllers`: 2 | 3 | :mod:`Controller` 4 | =================== 5 | 6 | .. automodule:: deer.experiment.base_controllers 7 | 8 | 9 | .. autosummary:: 10 | 11 | Controller 12 | LearningRateController 13 | EpsilonController 14 | DiscountFactorController 15 | TrainerController 16 | InterleavedTestEpochController 17 | FindBestController 18 | 19 | .. autoclass:: Controller 20 | :members: 21 | .. autoclass:: LearningRateController 22 | :show-inheritance: 23 | .. autoclass:: EpsilonController 24 | :show-inheritance: 25 | .. autoclass:: DiscountFactorController 26 | :show-inheritance: 27 | .. autoclass:: TrainerController 28 | :show-inheritance: 29 | .. autoclass:: InterleavedTestEpochController 30 | :show-inheritance: 31 | .. autoclass:: FindBestController 32 | :show-inheritance: 33 | -------------------------------------------------------------------------------- /docs/modules/environments.rst: -------------------------------------------------------------------------------- 1 | .. 
_`env_interface`: 2 | 3 | :mod:`Environment` 4 | ============================= 5 | 6 | .. automodule:: deer.base_classes.environment 7 | 8 | .. autoclass:: deer.base_classes.Environment 9 | :members: 10 | 11 | -------------------------------------------------------------------------------- /docs/modules/learning-algorithms.rst: -------------------------------------------------------------------------------- 1 | .. _learning-algorithms: 2 | 3 | :mod:`Learning algorithms` 4 | ========================== 5 | 6 | .. autosummary:: 7 | deer.base_classes.LearningAlgo 8 | deer.learning_algos.q_net_keras.MyQNetwork 9 | deer.learning_algos.AC_net_keras.MyACNetwork 10 | deer.learning_algos.CRAR_keras.CRAR 11 | 12 | .. autoclass:: deer.base_classes.LearningAlgo 13 | :members: 14 | 15 | .. autoclass:: deer.learning_algos.q_net_keras.MyQNetwork 16 | :members: 17 | 18 | .. autoclass:: deer.learning_algos.AC_net_keras.MyACNetwork 19 | :members: 20 | 21 | .. autoclass:: deer.learning_algos.CRAR_keras.CRAR 22 | :members: 23 | -------------------------------------------------------------------------------- /docs/modules/policies.rst: -------------------------------------------------------------------------------- 1 | .. _policies: 2 | 3 | :mod:`Policies` 4 | ========================== 5 | 6 | .. autosummary:: 7 | 8 | deer.base_classes.Policy 9 | deer.policies.EpsilonGreedyPolicy 10 | deer.policies.LongerExplorationPolicy 11 | 12 | .. autoclass:: deer.base_classes.Policy 13 | :members: 14 | .. autoclass:: deer.policies.EpsilonGreedyPolicy 15 | :members: 16 | :show-inheritance: 17 | .. autoclass:: deer.policies.LongerExplorationPolicy 18 | :members: 19 | :show-inheritance: 20 | -------------------------------------------------------------------------------- /docs/user/development.rst: -------------------------------------------------------------------------------- 1 | .. _dev: 2 | 3 | Development 4 | =========== 5 | 6 | DeeR is a work in progress and contributions are welcome via pull request. 7 | 8 | For more information, you can check out this link : |how_to_contrib|. 9 | 10 | .. |how_to_contrib| raw:: html 11 | 12 | Contributing to an open source Project on github 13 | 14 | 15 | You should also make sure that you install the repository approriately for development (see :ref:`dev-install`). 16 | 17 | Guidelines for this project 18 | --------------------------- 19 | 20 | Here are a few guidelines for this project. 21 | 22 | * Simplicity: Be easy to use but also easy to understand when one digs into the code. Any additional code should be justified by the usefulness of the feature. 23 | * Modularity: The user should be able to easily use its own code with any part of the deer framework (probably at the exception of the core of agent.py that is coded in a very general way). 24 | 25 | These guidelines come of course in addition to all good practices for open source development. 26 | 27 | .. _naming_conv: 28 | 29 | Naming convention for this project 30 | ---------------------------------- 31 | 32 | * All classes and methods have word boundaries using medial capitalization. Classes are written with UpperCamelCase and methods are written with lowerCamelCase respectively. Example: "two words" is rendered as "TwoWords" for the UpperCamelCase (classes) and "twoWords" for the lowerCamelCase (methods). 33 | * All attributes and variables have words separated by underscores. Example: "two words" is rendered as "two_words" 34 | * If a variable is intended to be 'private', it is prefixed by an underscore. 
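
A minimal sketch illustrating these conventions (the class, method and variable names below are invented purely for the example):

.. code-block:: python

    class ExperimentRunner(object):              # class name: UpperCamelCase
        def __init__(self, learning_rate):
            self.learning_rate = learning_rate   # attribute: words separated by underscores
            self._update_counter = 0             # 'private' attribute: leading underscore

        def runOneEpoch(self, epoch_length):     # method name: lowerCamelCase
            """ Run a single epoch of epoch_length steps (illustration only). """
            total_reward = 0                     # local variable: words separated by underscores
            self._update_counter += 1
            return total_reward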
35 | 36 | -------------------------------------------------------------------------------- /docs/user/environments.rst: -------------------------------------------------------------------------------- 1 | .. _examples: 2 | 3 | Examples 4 | ======== 5 | 6 | You can find these examples at the |package_root|. For each example at least two files are provided: 7 | 8 | * A launcher file (whose name usually starts by ``run_``). 9 | * An environnement file (whose name usually ends by ``_env``). 10 | 11 | 12 | .. |package_root| raw:: html 13 | 14 | root of the package 15 | 16 | 17 | The launcher file performs different actions: 18 | 19 | * It instantiates the environment and the agent along with a learning algorithm (such as a q-network). 20 | * It binds controllers to the agent 21 | * it finally runs the experiment 22 | 23 | You can get started with the following examples: 24 | 25 | .. toctree:: 26 | :maxdepth: 2 27 | 28 | environments/toy_env_time_series.rst 29 | environments/gym.rst 30 | environments/two_storages.rst 31 | environments/planning.rst 32 | environments/ALE.rst 33 | -------------------------------------------------------------------------------- /docs/user/environments/ALE.rst: -------------------------------------------------------------------------------- 1 | .. _ale: 2 | 3 | 4 | :mod:`ALE environment` 5 | ======================= 6 | 7 | This environment is an interface with the |ALE_link| that simulates any ATARI game. 8 | 9 | Related paper: Mnih, Volodymyr, et al. "Human-level control through deep reinforcement learning." Nature 518.7540 (2015): 529-533. (Hyper-parameters tuning is necessary if you want to try to replicate close performances.) 10 | 11 | 12 | .. |ALE_link| raw:: html 13 | 14 | ALE environment 15 | -------------------------------------------------------------------------------- /docs/user/environments/PLE.rst: -------------------------------------------------------------------------------- 1 | .. _ple: 2 | 3 | :mod:`PLE environment` 4 | ======================= 5 | 6 | This environment is an interface with the |PLE_link|. The provided example shows how to successfully learn a good policy on the simple |catcher_link| in a few epochs (~10). You should easily be able to learn successful policies for all the games provided (possibly with some hyper-parameters tuning). 7 | 8 | .. |PLE_link| raw:: html 9 | 10 | PLE environment 11 | 12 | .. |catcher_link| raw:: html 13 | 14 | "catcher" game 15 | -------------------------------------------------------------------------------- /docs/user/environments/gym.rst: -------------------------------------------------------------------------------- 1 | .. _gym: 2 | 3 | :mod:`Gym environment` 4 | ======================= 5 | 6 | Some examples are also provided with the |Gym_link|. 7 | 8 | .. |Gym_link| raw:: html 9 | 10 | Gym environment 11 | 12 | Here is the resulting policy for the mountain car example: 13 | 14 | .. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/gym_mountain_car.gif 15 | :width: 500 px 16 | :align: center 17 | 18 | Here is the resulting policy for the pendulum example: 19 | 20 | .. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/gym_pendulum.gif 21 | :width: 500 px 22 | :align: center 23 | 24 | -------------------------------------------------------------------------------- /docs/user/environments/planning.rst: -------------------------------------------------------------------------------- 1 | .. 
_planning: 2 | 3 | 4 | :mod:`Tasks with planning` 5 | ======================= 6 | 7 | You can find the following environments that demonstrate the possibilities of combining model-based and model-free: |CRAR_tests| and |CRAR_maze|. 8 | 9 | .. |CRAR_tests| raw:: html 10 | 11 | simples examples 12 | 13 | .. |CRAR_maze| raw:: html 14 | 15 | how to solve any maze taken from a distribution 16 | -------------------------------------------------------------------------------- /docs/user/environments/toy_env_time_series.rst: -------------------------------------------------------------------------------- 1 | .. _toy_env_time_series: 2 | 3 | :mod:`Toy environment with time series` 4 | ======================================= 5 | 6 | Description of the environement 7 | ############################### 8 | 9 | This environment simulates the possibility of buying or selling a good. The agent can either have one unit or zero unit of that good. At each transaction with the market, the agent obtains a reward equivalent to the price of the good when selling it and the opposite when buying. In addition, a penalty of 0.5 (negative reward) is added for each transaction. 10 | 11 | The price pattern is made by repeating the following signal plus a random constant between 0 and 3: 12 | 13 | .. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/plot_toy_example_signal.png 14 | :width: 250 px 15 | :alt: Toy example price pattern 16 | :align: center 17 | 18 | You can see how this environement is built by looking into the file ``Toy_env.py`` in |toy_env_folder|. It is important to note that any environment derives from the base class :ref:`env_interface` and you can refer to it in order to understand the required methods and their usage. 19 | 20 | .. |toy_env_folder| raw:: html 21 | 22 | examples/toy_env/ 23 | 24 | .. 25 | The price signal is built following the same rules for the training and the validation environments which allows the agent to learn a strategy that exploits this successfully. 26 | 27 | 28 | .. literalinclude:: ../../../examples/toy_env/Toy_env.py 29 | :language: python 30 | :lines: 21-75 31 | 32 | .. literalinclude:: ../../../examples/toy_env/Toy_env.py 33 | :language: python 34 | :lines: 116-130 35 | 36 | 37 | How to run 38 | ########## 39 | 40 | A minimalist way of running this example can be found in the file ``run_toy_env_simple.py`` in |toy_env_folder|. 41 | 42 | * First, we need to import the agent, the Q-network, the environement and some controllers 43 | 44 | .. literalinclude:: ../../../examples/toy_env/run_toy_env_simple.py 45 | :language: python 46 | :lines: 6-11 47 | :linenos: 48 | 49 | 50 | * Then we instantiate the different elements as follows: 51 | 52 | .. literalinclude:: ../../../examples/toy_env/run_toy_env_simple.py 53 | :language: python 54 | :lines: 13-51 55 | :linenos: 56 | 57 | 58 | Results 59 | ######## 60 | 61 | Navigate to the folder ``examples/toy_env/`` in a terminal window. The example can then be run by using 62 | 63 | .. code-block:: bash 64 | 65 | python run_toy_env_simple.py 66 | 67 | You can also choose the full version of the launcher that specifies the hyperparameters for better performance. 68 | 69 | .. code-block:: bash 70 | 71 | python run_toy_env.py 72 | 73 | Every 10 epochs, a graph is saved in the 'toy_env' folder. You can then visualize the test policy at the end of the training: 74 | 75 | .. 
image:: http://vincent.francois-l.be/img_GeneralDeepQRL/plot_toy_example.png 76 | :width: 250 px 77 | :alt: Toy example policy 78 | :align: center 79 | 80 | 81 | In this graph, you can see that the agent has successfully learned to take advantage of the price pattern to buy when it is low and to sell when it is high. This example is of course easy due to the fact that the patterns are very systematic which allows the agent to successfully learn it. It is important to note that the results shown are made on a validation set that is different from the training and we can see that learning generalizes well. For instance, the action of buying at time step 7 and 16 is the expected result because in average this will allow to make profit since the agent has no information on the future. 82 | 83 | Using Convolutions VS LSTM's 84 | ############################ 85 | 86 | So far, the neural network was build by using a convolutional architecture as follows: 87 | 88 | .. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/Convolutions_architecture.png 89 | :width: 350 px 90 | :alt: Convolutions architecture 91 | :align: center 92 | 93 | The neural nework processes time series thanks to a set of convolutions layers. The output of the convolutions as well as the other inputs are followed by fully connected layers and the ouput layer. 94 | 95 | 96 | When working with deep reinforcement learning, it is also possible to work with LSTM's (see for instance this |introduction-LSTM|) 97 | 98 | .. |introduction-LSTM| raw:: html 99 | 100 | introduction to LSTM's 101 | 102 | If you want to use LSTM's architecture, you can import the following libraries 103 | 104 | .. code-block:: bash 105 | 106 | from deer.learning_algos.NN_keras_LSTM import NN as NN_keras 107 | 108 | and then instanciate the qnetwork by specifying the 'neural_network' as follows: 109 | 110 | .. code-block:: bash 111 | 112 | qnetwork = MyQNetwork( 113 | env, 114 | neural_network=NN_keras) 115 | -------------------------------------------------------------------------------- /docs/user/environments/two_storages.rst: -------------------------------------------------------------------------------- 1 | .. _two_storages: 2 | 3 | :mod:`Two storage devices environment` 4 | ======================================== 5 | 6 | Description of the environement 7 | ############################### 8 | 9 | This example simulates the operation of a realistic micro-grid (such as a smart home for instance) that is not connected to the main utility grid (off-grid) and that is provided with PV panels, batteries and hydrogen storage. The battery has the advantage that it is not limited in instaneous power that it can provide or store. The hydrogen storage has the advantage that is can store very large quantity of energy. 10 | 11 | .. code-block:: bash 12 | 13 | python run_MG_two_storage_devices 14 | 15 | 16 | This example uses the environment defined in MG_two_storage_devices_env.py. The agent can either choose to store in the long term storage or take energy out of it while the short term storage handle at best the lack or surplus of energy by discharging itself or charging itself respectively. Whenever the short term storage is empty and cannot handle the net demand a penalty (negative reward) is obtained equal to the value of loss load set to 2euro/kWh. 
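
As an illustration of that reward rule only (the actual environment is implemented in ``MG_two_storages_env.py``; the function and variable names below are hypothetical):

.. code-block:: python

    COST_LOSS_LOAD = 2.0  # euro per kWh of demand that cannot be supplied

    def loss_load_penalty(net_demand_kwh, short_term_storage_kwh):
        """ Negative reward obtained when the short term storage cannot cover the net demand. """
        unsupplied_kwh = max(0.0, net_demand_kwh - short_term_storage_kwh)
        return -COST_LOSS_LOAD * unsupplied_kwh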
17 | 18 | The state of the agent is made up of an history of two to four punctual observations: 19 | 20 | * Charging state of the short term storage (0 is empty, 1 is full) 21 | * Production and consumption (0 is no production or consumption, 1 is maximal production or consumption) 22 | * (Distance to equinox) 23 | * (Predictions of future production : average of the production for the next 24 hours and 48 hours) 24 | 25 | Two actions are possible for the agent: 26 | 27 | * Action 0 corresponds to discharging the long-term storage 28 | * Action 1 corresponds to charging the long-term storage 29 | 30 | More information can be found in 31 | `Deep Reinforcement Learning Solutions for Energy Microgrids Management`_, Vincent François-Lavet, David Taralla, Damien Ernst, Raphael Fonteneau 32 | 33 | .. _Deep Reinforcement Learning Solutions for Energy Microgrids Management: https://ewrl.files.wordpress.com/2016/11/ewrl13-2016-submission_21.pdf 34 | 35 | Annex to the paper 36 | ################## 37 | 38 | .. 39 | Neural network architecture 40 | *************************** 41 | 42 | We propose a neural network architecture where the inputs are provided by the state vector, and where each separate output represents the Q-value function for one of the discretized actions. The action :math:`a_t` to be made at time :math:`t` is whether to charge or discharge the hydrogen storage device with the assumption that the batteries handle at best the current demand (avoid any value of loss load whenever possible). We consider three discretized actions : (i) discharge at full rate the hydrogen storage, (ii) keep it idle or (iii) charge it at full rate. 43 | 44 | The neural network process time series thanks to a set of convolutions that convolves 16 filters of :math:`2 \times 1` with stride 1 followed by a convolution with 16 filters of :math:`2 \times 2` with stride 1. The output of the convolutions as well as the other inputs are then followed by two fully connected layers with 50 and 20 neurons and the ouput layer. The activation function used is the Rectified Linear Unit (ReLU) except for the output layer where no activation function is used. 45 | 46 | .. figure:: http://vincent.francois-l.be/img_GeneralDeepQRL/Convolutions_architecture.png 47 | :width: 400 px 48 | :align: center 49 | 50 | Sketch of the structure of the neural network architecture (without representing the actual number of neurons in each layer). The neural network processes time series thanks to a set of convolutions layers. The output of the convolutions as well as the other inputs are followed by fully connected layers and the ouput layer. 51 | 52 | 53 | PV production and consumption profiles 54 | ************************************** 55 | Solar irradiance varies throughout the year depending on the seasons, and it also varies throughout the day depending on the weather and the position of the sun in the sky relative to the PV panels. The main distinction between these profiles is the difference between summer and winter PV production. In particular, production varies with a factor 1:5 between winter and summer as can be seen from the measurements of PV panels production for a residential customer located in Belgium in the figures below. 56 | 57 | .. figure:: http://vincent.francois-l.be/img_GeneralDeepQRL/ProductionVSMonths_be.png 58 | :width: 300 px 59 | :align: center 60 | 61 | Total energy produced per month 62 | 63 | .. 
figure:: http://vincent.francois-l.be/img_GeneralDeepQRL/ProductionVSTime_1janv_be.png 64 | :width: 300 px 65 | :align: center 66 | 67 | Typical production in winter 68 | 69 | .. figure:: http://vincent.francois-l.be/img_GeneralDeepQRL/ProductionVSTime_1july_be.png 70 | :width: 300 px 71 | :align: center 72 | 73 | Typical production in summer 74 | 75 | 76 | A simple residential consumption profile is considered with a daily average consumption of 18kWh (see figure below). 77 | 78 | .. figure:: http://vincent.francois-l.be/img_GeneralDeepQRL/ConsumptionVSTime_random.png 79 | :width: 300 px 80 | :align: center 81 | 82 | Representative residential consumption profile 83 | 84 | 85 | 86 | Main microgrid parameters 87 | ************************** 88 | 89 | .. list-table:: Data used for the PV panels 90 | :widths: 30 10 20 91 | 92 | * - cost 93 | - :math:`c^{PV}` 94 | - :math:`1 euro/W_p` 95 | * - Efficiency 96 | - :math:`\eta^{PV}` 97 | - :math:`18 \%` 98 | * - Life time 99 | - :math:`L^{PV}` 100 | - :math:`20 years` 101 | 102 | .. list-table:: Data used for the :math:`LiFePO_4` battery 103 | :widths: 30 10 20 104 | 105 | * - cost 106 | - :math:`c^B` 107 | - :math:`500 euro/kWh` 108 | * - discharge efficiency 109 | - :math:`\eta_0^B` 110 | - :math:`90\%` 111 | * - charge efficiency 112 | - :math:`\zeta_0^B` 113 | - :math:`90\%` 114 | * - Maximum instantaneous power 115 | - :math:`P^B` 116 | - :math:`> 10kW` 117 | * - Life time 118 | - :math:`L^{B}` 119 | - :math:`20 years` 120 | 121 | .. list-table:: Data used for the Hydrogen storage device 122 | :widths: 30 10 20 123 | 124 | * - cost 125 | - :math:`c^{H_2}` 126 | - :math:`14 euro/W_p` 127 | * - discharge efficiency 128 | - :math:`\eta_0^{H_2}` 129 | - :math:`65\%` 130 | * - charge efficiency 131 | - :math:`\zeta_0^{H_2}` 132 | - :math:`65\%` 133 | * - Life time 134 | - :math:`L^{H_2}` 135 | - :math:`20 years` 136 | 137 | .. list-table:: Data used for reward function 138 | :widths: 30 10 20 139 | 140 | * - cost endured per kWh not supplied within the microgrid 141 | - :math:`k` 142 | - :math:`2 euro/kWh` 143 | * - revenue/cost per kWh of hydrogen produced/used 144 | - :math:`k^{H_2}` 145 | - :math:`0.1 euro/kWh` 146 | -------------------------------------------------------------------------------- /docs/user/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============== 5 | 6 | 7 | Dependencies 8 | -------------- 9 | 10 | This framework is tested to work under Python 3.6. 11 | 12 | The required dependencies are NumPy >= 1.10, joblib >= 0.9. You also need keras or you can write your own learning algorithms using your favorite deep learning framework. 13 | 14 | For running some of the examples, Matplotlib >= 1.1.1 is required. You also sometimes need to install specific dependencies (e.g. for the atari games, you need to install ALE >= 0.4). 15 | 16 | 17 | We recommend to use the bleeding-edge version and to install it by following the :ref:`dev-install`. If you want a simpler installation procedure and do not intend to modify yourself the learning algorithms etc., you can look at the :ref:`user-install`. 18 | 19 | .. _dev-install: 20 | 21 | Developer install instructions 22 | ------------------------------- 23 | 24 | As a developer, you can set you up with the bleeding-edge version of DeeR with: 25 | 26 | .. 
code-block:: bash 27 | 28 | git clone -b master https://github.com/VinF/deer.git 29 | 30 | Assuming you already have a python environment with ``pip``, you can automatically install all the dependencies (except specific dependencies that you may need for some examples) with: 31 | 32 | .. code-block:: bash 33 | 34 | pip install -r requirements.txt 35 | 36 | 37 | And you can install the framework as a package using the mode ``develop`` so that you can make modifications and test without having to re-install the package. 38 | 39 | .. code-block:: bash 40 | 41 | python setup.py develop 42 | 43 | 44 | .. _user-install: 45 | 46 | User install instructions 47 | -------------------------- 48 | 49 | You can install the framework with pip: 50 | 51 | .. code-block:: bash 52 | 53 | pip install deer 54 | 55 | For the bleeding edge version (recommended), you can simply use 56 | 57 | .. code-block:: bash 58 | 59 | pip install git+git://github.com/VINF/deer.git@master 60 | 61 | 62 | .. 63 | If you want to update it to the bleeding edge version you can use pip for this with the command line below: 64 | 65 | .. code-block:: bash 66 | 67 | pip install --upgrade --no-deps git+git://github.com/VinF/deer 68 | 69 | 70 | -------------------------------------------------------------------------------- /docs/user/tutorial.rst: -------------------------------------------------------------------------------- 1 | Tutorial 2 | ========= 3 | 4 | .. _what-is-deer: 5 | 6 | What is deep reinforcement learning? 7 | ------------------------------------ 8 | 9 | Deep reinforcement learning is the combination of two fields: 10 | 11 | * *Reinforcement learning (RL)* is a theory that allows an agent to learn a startegy so as to maximize a sum of cumulated (delayed) rewards from any given environment. If you are not familiar with RL, you can get up to speed easily with the |SB_link|. 12 | 13 | .. |SB_link| raw:: html 14 | 15 | book by Sutton and Barto 16 | 17 | 18 | * *Deep learning* is a branch of machine learning for regression and classification. It is particularly well suited to model high-level abstractions in data by using multiple processing layers composed of multiple non-linear transformations. 19 | 20 | This combination allows to learn complex tasks such as playing ATARI games from high-dimensional sensory inputs. For more information, you can refer to this |intro-deep-RL|. 21 | 22 | .. |intro-deep-RL| raw:: html 23 | 24 | introduction to deep reinforcement learning 25 | 26 | .. 27 | How does it work? 28 | ------------------- 29 | 30 | In RL, there are two main parts: 31 | 32 | * An agent with learning capabilities. 33 | * An environment. 34 | 35 | The environment defines the task to be performed by the agent with the following elements: 36 | 37 | * a set of environment states S 38 | * a set of actions A 39 | * a dynamics of the system, i.e. rules of transitioning between states 40 | * a reward function, i.e rules that determine the immediate reward (scalar) of a transition 41 | * a set of obsevrations O, that may be the same than S (MDP case) or different (POMDP case) 42 | 43 | 44 | How can I get started? 45 | ----------------------- 46 | 47 | First, make sure you have installed the package properly by following the steps described in :ref:`installation`. 48 | 49 | The general idea of this framework is that you need to instantiate an agent (along with a learning algorithm) and an environment. 
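
A minimal sketch of that instantiation step, loosely based on ``run_toy_env_simple.py`` (it assumes python is launched from ``examples/toy_env/`` so that ``Toy_env`` is importable; see that file for the exact launcher arguments):

.. code-block:: python

    import numpy as np

    from deer.agent import NeuralAgent
    from deer.learning_algos.q_net_keras import MyQNetwork
    from Toy_env import MyEnv as Toy_env

    rng = np.random.RandomState(123456)

    env = Toy_env(rng)                                        # the environment
    qnetwork = MyQNetwork(environment=env, random_state=rng)  # the learning algorithm
    agent = NeuralAgent(env, qnetwork, random_state=rng)      # the agent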
In order to perform an experiment, you also need to attach to the agent some controllers that manage the training and the various parameters of your agent. 50 | 51 | The environment has to be built specifically for each task, while learning algorithms (such as Q-networks) and many controllers are provided within this package. 52 | 53 | The best way to get started is to have a look at the :ref:`examples`, in particular the first two environments, which are simple to understand: 54 | 55 | * :ref:`toy_env_time_series` 56 | * :ref:`gym` 57 | 58 | If you find something that is not yet implemented and you wish to contribute, you can check the section :ref:`dev`. 59 | 60 | .. 61 | From there, you can look at this documentation for more information on the controllers and the other environments. 62 | 63 | Any Question? 64 | ------------- 65 | 66 | .. |Google_group| raw:: html 67 | 68 | https://groups.google.com/forum/#!forum/deer-library 69 | 70 | .. |Deer_issues| raw:: html 71 | 72 | https://github.com/VinF/deer/issues 73 | 74 | You can raise questions about the DeeR project on GitHub: |Deer_issues| 75 | 76 | -------------------------------------------------------------------------------- /examples/ALE/ALE_env.py: -------------------------------------------------------------------------------- 1 | """ Interface with the ALE environment 2 | 3 | """ 4 | 5 | import numpy as np 6 | import cv2 7 | from ale_python_interface import ALEInterface 8 | from deer.base_classes import Environment 9 | 10 | from mpl_toolkits.axes_grid1 import host_subplot 11 | import mpl_toolkits.axisartist as AA 12 | import matplotlib.pyplot as plt 13 | 14 | class MyEnv(Environment): 15 | VALIDATION_MODE = 0 16 | 17 | def __init__(self, rng, rom="ale/breakout.bin", frame_skip=4, 18 | ale_options=[{"key": "random_seed", "value": 0}, 19 | {"key": "color_averaging", "value": True}, 20 | {"key": "repeat_action_probability", "value": 0.}]): 21 | self._mode = -1 22 | self._mode_score = 0.0 23 | self._mode_episode_count = 0 24 | 25 | self._frame_skip = frame_skip if frame_skip >= 1 else 1 26 | self._random_state = rng 27 | 28 | self._ale = ALEInterface() 29 | for option in ale_options: 30 | t = type(option["value"]) 31 | if t is int: 32 | self._ale.setInt(option["key"], option["value"]) 33 | elif t is float: 34 | self._ale.setFloat(option["key"], option["value"]) 35 | elif t is bool: 36 | self._ale.setBool(option["key"], option["value"]) 37 | else: 38 | raise ValueError("Option {} ({}) is not an int, bool or float.".format(option["key"], t)) 39 | self._ale.loadROM(rom) 40 | 41 | w, h = self._ale.getScreenDims() 42 | self._screen = np.empty((h, w), dtype=np.uint8) 43 | self._reduced_screen = np.empty((84, 84), dtype=np.uint8) 44 | self._actions = self._ale.getMinimalActionSet() 45 | 46 | 47 | def reset(self, mode): 48 | if mode == MyEnv.VALIDATION_MODE: 49 | if self._mode != MyEnv.VALIDATION_MODE: 50 | self._mode = MyEnv.VALIDATION_MODE 51 | self._mode_score = 0.0 52 | self._mode_episode_count = 0 53 | else: 54 | self._mode_episode_count += 1 55 | elif self._mode != -1: # and thus mode == -1 56 | self._mode = -1 57 | 58 | self._ale.reset_game() 59 | for _ in range(self._random_state.randint(15)): 60 | self._ale.act(0) 61 | self._ale.getScreenGrayscale(self._screen) 62 | cv2.resize(self._screen, (84, 84), self._reduced_screen, interpolation=cv2.INTER_NEAREST) 63 | 64 | return [4 * [84 * [84 * [0]]]] 65 | 66 | 67 | def act(self, action): 68 | action = self._actions[action] 69 | 70 | reward = self._ale.act(action) 71 | #if
self.inTerminalState(): 72 | # break 73 | 74 | self._ale.getScreenGrayscale(self._screen) 75 | cv2.resize(self._screen, (84, 84), self._reduced_screen, interpolation=cv2.INTER_NEAREST) 76 | 77 | self._mode_score += reward 78 | return np.sign(reward) 79 | 80 | def summarizePerformance(self, test_data_set, *args, **kwargs): 81 | if self.inTerminalState() == False: 82 | self._mode_episode_count += 1 83 | print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) 84 | 85 | 86 | def inputDimensions(self): 87 | return [(4, 84, 84)] 88 | 89 | def observationType(self, subject): 90 | return np.uint8 91 | 92 | def nActions(self): 93 | return len(self._actions) 94 | 95 | def observe(self): 96 | return [np.array(self._reduced_screen)] 97 | 98 | def inTerminalState(self): 99 | return self._ale.game_over() 100 | 101 | 102 | 103 | if __name__ == "__main__": 104 | pass -------------------------------------------------------------------------------- /examples/ALE/ALE_env_gym.py: -------------------------------------------------------------------------------- 1 | """ Interface with the ALE environment 2 | 3 | Authors: Vincent Francois-Lavet 4 | """ 5 | import numpy as np 6 | np.set_printoptions(threshold=np.nan) 7 | import cv2 8 | #from ale_python_interface import ALEInterface 9 | import gym 10 | from deer.base_classes import Environment 11 | 12 | #import matplotlib 13 | #matplotlib.use('qt5agg') 14 | #from mpl_toolkits.axes_grid1 import host_subplot 15 | #import mpl_toolkits.axisartist as AA 16 | #import matplotlib.pyplot as plt 17 | #from PIL import Image 18 | 19 | class MyEnv(Environment): 20 | VALIDATION_MODE = 0 21 | 22 | def __init__(self, rng, **kwargs): 23 | """ Initialize environment. 24 | 25 | Arguments: 26 | rng - the numpy random number generator 27 | """ 28 | if(bool(kwargs["game"])): 29 | self.env = gym.make(kwargs["game"]) 30 | else: 31 | # Choice between Seaquest-v4, Breakout-v4, SpaceInvaders-v4, BeamRider-v4, Qbert-v4, Freeway-v4', etc. 32 | self.env = gym.make('Seaquest-v4') 33 | self._random_state=rng 34 | self.env.reset() 35 | frame_skip=kwargs.get('frame_skip',1) 36 | self._frame_skip = frame_skip if frame_skip >= 1 else 1 37 | 38 | self._screen=np.average(self.env.render(mode='rgb_array'),axis=-1) 39 | self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_LINEAR) 40 | #decide whether you want to keep this in repo, if so: add dependency to cv2 41 | #plt.imshow(self._reduced_screen, cmap='gray') 42 | #plt.show() 43 | 44 | self._mode = -1 45 | self._mode_score = 0.0 46 | self._mode_episode_count = 0 47 | 48 | 49 | 50 | def reset(self, mode): 51 | if mode == self._mode: 52 | # already in the right mode 53 | self._mode_episode_count += 1 54 | else: 55 | # switching mode 56 | self._mode = mode 57 | self._mode_score = 0.0 58 | self._mode_episode_count = 0 59 | 60 | self.env.reset() 61 | for _ in range(self._random_state.randint(15)): 62 | action = self.env.action_space.sample() 63 | 64 | # this executes the environment with an action, 65 | # and returns the observation of the environment, 66 | # the reward, if the env is over, and other info. 
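            # NB: this relies on the classic gym "step" API that returns a 4-tuple
            # (observation, reward, done, info); gym >= 0.26 and gymnasium return a
            # 5-tuple instead, so this example assumes an older gym release.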
67 | observation, reward, self.terminal, info = self.env.step(action) 68 | 69 | self._screen=np.average(self.env.render(mode='rgb_array'),axis=-1) 70 | self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_LINEAR) 71 | self.state=np.zeros((84,84), dtype=np.uint8) #FIXME 72 | 73 | return [1*[4 * [84 * [84 * [0]]]]] 74 | 75 | 76 | def act(self, action): 77 | #print "action" 78 | #print action 79 | 80 | self.state=np.zeros((4,84,84), dtype=np.float) 81 | reward=0 82 | for t in range(4): 83 | observation, r, self.terminal, info = self.env.step(action) 84 | #print "observation, reward, self.terminal" 85 | #print observation, reward, self.terminal 86 | reward+=r 87 | if self.inTerminalState(): 88 | break 89 | 90 | self._screen=np.average(observation,axis=-1) # Gray levels 91 | self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_NEAREST) # 84*84 92 | #plt.imshow(self._screen, cmap='gray') 93 | #plt.show() 94 | self.state[t,:,:]=self._reduced_screen 95 | 96 | self._mode_score += reward 97 | return np.sign(reward) 98 | 99 | def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): 100 | if self.inTerminalState() == False: 101 | self._mode_episode_count += 1 102 | print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) 103 | 104 | 105 | def inputDimensions(self): 106 | return [(1, 4, 84, 84)] 107 | 108 | def observationType(self, subject): 109 | return np.float16 110 | 111 | def nActions(self): 112 | print ("self.env.action_space") 113 | print (self.env.action_space) 114 | return self.env.action_space.n 115 | 116 | def observe(self): 117 | return [(np.array(self.state)-128.)/128.] 118 | 119 | def inTerminalState(self): 120 | return self.terminal 121 | 122 | 123 | 124 | if __name__ == "__main__": 125 | pass -------------------------------------------------------------------------------- /examples/ALE/run_ALE.py: -------------------------------------------------------------------------------- 1 | """ALE launcher. See Wiki for more details about this experiment. 
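The script instantiates a gym-based ALE environment (ALE_env_gym.MyEnv), a double Q-network and a
NeuralAgent, then attaches the usual controllers (trainer, learning rate, discount, epsilon,
interleaved test epochs) before running the experiment.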
2 | 3 | """ 4 | 5 | import sys 6 | import logging 7 | import numpy as np 8 | from joblib import hash, dump,load 9 | import os 10 | 11 | from deer.default_parser import process_args 12 | from deer.agent import NeuralAgent 13 | from deer.learning_algos.q_net_keras import MyQNetwork 14 | from ALE_env_gym import MyEnv as ALE_env 15 | import deer.experiment.base_controllers as bc 16 | 17 | from deer.policies import EpsilonGreedyPolicy 18 | 19 | class Defaults: 20 | # ---------------------- 21 | # Experiment Parameters 22 | # ---------------------- 23 | STEPS_PER_EPOCH = 10000#250000 24 | EPOCHS = 500#40 25 | STEPS_PER_TEST = 2000#125000 26 | PERIOD_BTW_SUMMARY_PERFS = 1 27 | 28 | # ---------------------- 29 | # Environment Parameters 30 | # ---------------------- 31 | FRAME_SKIP = 4 32 | 33 | # ---------------------- 34 | # DQN Agent parameters: 35 | # ---------------------- 36 | UPDATE_RULE = 'rmsprop' 37 | LEARNING_RATE = 0.0002 38 | LEARNING_RATE_DECAY = 1.#0.99 39 | DISCOUNT = 0.95 40 | DISCOUNT_INC = 0.99 41 | DISCOUNT_MAX = 0.99 42 | RMS_DECAY = 0.9 43 | RMS_EPSILON = 0.0001 44 | MOMENTUM = 0 45 | CLIP_NORM = 1.0 46 | EPSILON_START = 1.0 47 | EPSILON_MIN = .1 48 | EPSILON_DECAY = 100000 49 | UPDATE_FREQUENCY = 1 50 | REPLAY_MEMORY_SIZE = 250000 #Each element is 4 frames --> 10^6 frames 51 | BATCH_SIZE = 32 52 | FREEZE_INTERVAL = 2500 53 | DETERMINISTIC = True 54 | 55 | 56 | if __name__ == "__main__": 57 | logging.basicConfig(level=logging.INFO) 58 | 59 | # --- Parse parameters --- 60 | parameters = process_args(sys.argv[1:], Defaults) 61 | if parameters.deterministic: 62 | rng = np.random.RandomState(123456) 63 | else: 64 | rng = np.random.RandomState() 65 | 66 | # --- Instantiate environment --- 67 | #env = ALE_env(rng, frame_skip=parameters.frame_skip, 68 | # ale_options=[{"key": "random_seed", "value": rng.randint(9999)}, 69 | # {"key": "color_averaging", "value": True}, 70 | # {"key": "repeat_action_probability", "value": 0.}]) 71 | 72 | env = ALE_env(rng, game=parameters.param1, frame_skip=parameters.frame_skip) 73 | 74 | # --- Instantiate qnetwork --- 75 | qnetwork = MyQNetwork( 76 | env, 77 | parameters.rms_decay, 78 | parameters.rms_epsilon, 79 | parameters.momentum, 80 | parameters.clip_norm, 81 | parameters.freeze_interval, 82 | parameters.batch_size, 83 | parameters.update_rule, 84 | rng, 85 | double_Q=True) 86 | 87 | train_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.) 88 | test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) 89 | 90 | # --- Instantiate agent --- 91 | agent = NeuralAgent( 92 | env, 93 | qnetwork, 94 | parameters.replay_memory_size, 95 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 96 | parameters.batch_size, 97 | rng, 98 | train_policy=train_policy, 99 | test_policy=test_policy) 100 | 101 | # --- Create unique filename for FindBestController --- 102 | h = hash(vars(parameters), hash_name="sha1") 103 | fname = "ALE_" + h 104 | print("The parameters hash is: {}".format(h)) 105 | print("The parameters are: {}".format(parameters)) 106 | 107 | # --- Bind controllers to the agent --- 108 | # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 109 | # learning rate as well as the training epoch number. 110 | agent.attach(bc.VerboseController( 111 | evaluate_on='epoch', 112 | periodicity=1)) 113 | 114 | # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 
115 | # Plus, we also want to display after each training episode (!= than after every training) the average bellman 116 | # residual and the average of the V values obtained during the last episode, hence the two last arguments. 117 | agent.attach(bc.TrainerController( 118 | evaluate_on='action', 119 | periodicity=parameters.update_frequency, 120 | show_episode_avg_V_value=True, 121 | show_avg_Bellman_residual=True)) 122 | 123 | # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 124 | # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 125 | agent.attach(bc.LearningRateController( 126 | initial_learning_rate=parameters.learning_rate, 127 | learning_rate_decay=parameters.learning_rate_decay, 128 | periodicity=1)) 129 | 130 | # Same for the discount factor. 131 | agent.attach(bc.DiscountFactorController( 132 | initial_discount_factor=parameters.discount, 133 | discount_factor_growth=parameters.discount_inc, 134 | discount_factor_max=parameters.discount_max, 135 | periodicity=1)) 136 | 137 | # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy 138 | # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more 139 | # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every 140 | # episode or epoch (or never, hence the resetEvery='none'). 141 | agent.attach(bc.EpsilonController( 142 | initial_e=parameters.epsilon_start, 143 | e_decays=parameters.epsilon_decay, 144 | e_min=parameters.epsilon_min, 145 | evaluate_on='action', 146 | periodicity=1, 147 | reset_every='none')) 148 | 149 | # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 150 | # seems to generalize the better, thus which one has the highest validation score. Here, we do not care about the 151 | # "true generalization score", or "test score". 152 | # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is 153 | # important that the validationID is the same than the id argument of the InterleavedTestEpochController. 154 | # The FindBestController will dump on disk the validation scores for each and every network, as well as the 155 | # structure of the neural network having the best validation score. These dumps can then used to plot the evolution 156 | # of the validation and test scores (see below) or simply recover the resulting neural network for your 157 | # application. 158 | # agent.attach(bc.FindBestController( 159 | # validationID=ALE_env.VALIDATION_MODE, 160 | # testID=None, 161 | # unique_fname=fname)) 162 | 163 | # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 164 | # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want 165 | # these validation epoch to interfere with the training of the agent, which is well established by the 166 | # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole 167 | # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the 168 | # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards 169 | # obtained, hence the showScore=True. 
Finally, we want to call the summarizePerformance method of ALE_env every 170 | # [parameters.period_btw_summary_perfs] *validation* epochs. 171 | agent.attach(bc.InterleavedTestEpochController( 172 | id=ALE_env.VALIDATION_MODE, 173 | epoch_length=parameters.steps_per_test, 174 | periodicity=1, 175 | show_score=True, 176 | summarize_every=1)) 177 | 178 | agent.attach(bc.InterleavedTestEpochController( 179 | id=ALE_env.VALIDATION_MODE+1, 180 | epoch_length=parameters.steps_per_test, 181 | periodicity=1, 182 | show_score=True, 183 | summarize_every=1)) 184 | 185 | agent.attach(bc.InterleavedTestEpochController( 186 | id=ALE_env.VALIDATION_MODE+2, 187 | epoch_length=parameters.steps_per_test, 188 | periodicity=1, 189 | show_score=True, 190 | summarize_every=1)) 191 | 192 | agent.attach(bc.InterleavedTestEpochController( 193 | id=ALE_env.VALIDATION_MODE+3, 194 | epoch_length=parameters.steps_per_test, 195 | periodicity=1, 196 | show_score=True, 197 | summarize_every=1)) 198 | 199 | 200 | # --- Run the experiment --- 201 | try: 202 | os.mkdir("params") 203 | except Exception: 204 | pass 205 | dump(vars(parameters), "params/" + fname + ".jldump") 206 | agent.run(parameters.epochs, parameters.steps_per_epoch) 207 | 208 | # --- Show results --- 209 | basename = "scores/" + fname 210 | scores = load(basename + "_scores.jldump") 211 | plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') 212 | plt.legend() 213 | plt.xlabel("Number of epochs") 214 | plt.ylabel("Score") 215 | plt.savefig(basename + "_scores.pdf") 216 | plt.show() 217 | -------------------------------------------------------------------------------- /examples/MG_two_storages/data/BelgiumPV_prod_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/examples/MG_two_storages/data/BelgiumPV_prod_test.npy -------------------------------------------------------------------------------- /examples/MG_two_storages/data/BelgiumPV_prod_train.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/examples/MG_two_storages/data/BelgiumPV_prod_train.npy -------------------------------------------------------------------------------- /examples/MG_two_storages/data/example_nondeterminist_cons_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/examples/MG_two_storages/data/example_nondeterminist_cons_test.npy -------------------------------------------------------------------------------- /examples/MG_two_storages/data/example_nondeterminist_cons_train.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/examples/MG_two_storages/data/example_nondeterminist_cons_train.npy -------------------------------------------------------------------------------- /examples/MG_two_storages/data/spotmarket_data_2007-2013.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/examples/MG_two_storages/data/spotmarket_data_2007-2013.xls -------------------------------------------------------------------------------- /examples/MG_two_storages/plot_MG_operation.py: 
-------------------------------------------------------------------------------- 1 | from mpl_toolkits.axes_grid1 import host_subplot 2 | import mpl_toolkits.axisartist as AA 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def plot_op(actions, consumption,production,rewards,battery_level, plot_name): 8 | #### 9 | # PLOT 10 | #### 11 | 12 | print ( "In this plot, total score"+str(np.sum(rewards)) ) 13 | print ( "H:"+str( np.sum(actions)*0.1 ) ) 14 | print ( "-:"+str( np.sum(rewards)-np.sum(actions)*0.1 ) ) 15 | 16 | print ("battery_level") 17 | print (battery_level) 18 | print ("actions") 19 | print (actions) 20 | print ("consumption") 21 | print (consumption) 22 | print ("production") 23 | print (production) 24 | print ("rewards") 25 | print (rewards) 26 | 27 | steps=np.arange(100) 28 | print (steps) 29 | print ("battery_level") 30 | print (battery_level) 31 | print (consumption) 32 | print (production) 33 | print ("demand:") 34 | print (consumption-production) 35 | 36 | steps_long=np.arange(1000)/10. 37 | 38 | host = host_subplot(111, axes_class=AA.Axes) 39 | plt.subplots_adjust(left=0.2, right=0.8) 40 | 41 | par1 = host.twinx() 42 | par2 = host.twinx() 43 | par3 = host.twinx() 44 | 45 | offset = 60 46 | new_fixed_axis = par2.get_grid_helper().new_fixed_axis 47 | par2.axis["right"] = new_fixed_axis(loc="right", 48 | axes=par2, 49 | offset=(offset, 0)) 50 | par2.axis["right"].toggle(all=True) 51 | 52 | offset = -60 53 | new_fixed_axis = par3.get_grid_helper().new_fixed_axis 54 | par3.axis["right"] = new_fixed_axis(loc="left", 55 | axes=par3, 56 | offset=(offset, 0)) 57 | par3.axis["right"].toggle(all=True) 58 | 59 | 60 | host.set_xlim(-0.9, 99) 61 | host.set_ylim(0, 20.9) 62 | 63 | host.set_xlabel("Time (h)") 64 | host.set_ylabel("Battery level (kWh)") 65 | par1.set_ylabel("Consumption (kW)") 66 | par2.set_ylabel("Production (kW)") 67 | par3.set_ylabel("H Actions (kW)") 68 | 69 | p1, = host.plot(steps, battery_level, marker='o', lw=1, c = 'b', alpha=0.8, ls='-', label = 'Battery level') 70 | p2, = par1.plot(steps_long-0.9, np.repeat(consumption,10), lw=3, c = 'r', alpha=0.5, ls='-', label = 'Consumption') 71 | p3, = par2.plot(steps_long-0.9, np.repeat(production,10), lw=3, c = 'g', alpha=0.5, ls='-', label = 'Production') 72 | p4, = par3.plot(steps_long, np.repeat(actions,10), lw=3, c = 'c', alpha=0.5, ls='-', label = 'H Actions') 73 | 74 | par1.set_ylim(0, 10.09) 75 | par2.set_ylim(0, 10.09) 76 | par3.set_ylim(-1.5, 1.5) 77 | 78 | #host.legend(loc=2)#loc=9) 79 | 80 | host.axis["left"].label.set_color(p1.get_color()) 81 | par1.axis["right"].label.set_color(p2.get_color()) 82 | par2.axis["right"].label.set_color(p3.get_color()) 83 | par3.axis["right"].label.set_color(p4.get_color()) 84 | 85 | plt.savefig(plot_name) 86 | 87 | #plt.draw() 88 | #plt.show() 89 | #plt.close('all') 90 | -------------------------------------------------------------------------------- /examples/MG_two_storages/run_MG_two_storages.py: -------------------------------------------------------------------------------- 1 | """2-Storage Microgrid launcher. See the docs for more details about this experiment. 
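The script follows the same structure as the other launchers: it instantiates the microgrid
environment, a Q-network and a NeuralAgent, attaches the training controllers, and interleaves
validation and test epochs that are used by a FindBestController to select the best network.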
2 | 3 | """ 4 | 5 | import sys 6 | import logging 7 | import numpy as np 8 | from joblib import hash, dump, load 9 | import os 10 | import matplotlib.pyplot as plt 11 | 12 | import sys 13 | from os import path 14 | sys.path.append( path.dirname( path.dirname( path.abspath(__file__) ) ) ) 15 | 16 | from deer.default_parser import process_args 17 | from deer.agent import NeuralAgent 18 | from deer.learning_algos.q_net_keras import MyQNetwork 19 | from MG_two_storages_env import MyEnv as MG_two_storages_env 20 | import deer.experiment.base_controllers as bc 21 | 22 | class Defaults: 23 | # ---------------------- 24 | # Experiment Parameters 25 | # ---------------------- 26 | STEPS_PER_EPOCH = 365*24-1 27 | EPOCHS = 200 28 | STEPS_PER_TEST = 365*24-1 29 | PERIOD_BTW_SUMMARY_PERFS = -1 # Set to -1 for avoiding call to env.summarizePerformance 30 | 31 | # ---------------------- 32 | # Environment Parameters 33 | # ---------------------- 34 | FRAME_SKIP = 1 35 | 36 | # ---------------------- 37 | # DQN Agent parameters: 38 | # ---------------------- 39 | UPDATE_RULE = 'rmsprop' 40 | LEARNING_RATE = 0.0002 41 | LEARNING_RATE_DECAY = 0.99 42 | DISCOUNT = 0.9 43 | DISCOUNT_INC = 0.99 44 | DISCOUNT_MAX = 0.98 45 | RMS_DECAY = 0.9 46 | RMS_EPSILON = 0.0001 47 | MOMENTUM = 0 48 | CLIP_NORM = 1.0 49 | EPSILON_START = 1.0 50 | EPSILON_MIN = .3 51 | EPSILON_DECAY = 500000 52 | UPDATE_FREQUENCY = 1 53 | REPLAY_MEMORY_SIZE = 1000000 54 | BATCH_SIZE = 32 55 | FREEZE_INTERVAL = 1000 56 | DETERMINISTIC = False 57 | 58 | 59 | 60 | 61 | if __name__ == "__main__": 62 | logging.basicConfig(level=logging.INFO) 63 | 64 | # --- Parse parameters --- 65 | parameters = process_args(sys.argv[1:], Defaults) 66 | 67 | if parameters.deterministic: 68 | rng = np.random.RandomState(123456) 69 | else: 70 | rng = np.random.RandomState() 71 | 72 | if(parameters.param1 is not None and parameters.param1!="1"): 73 | # We Reduce the size of the time series so that the number of days is divisible by 4*parameters.param1 74 | # That way, the number of days in each season is divisible by parameters.param1 and it is thus possible 75 | # to reduce the variety of the data within each season in the time series by a factor of parameters.param1 76 | parameters.steps_per_epoch=parameters.steps_per_epoch-(parameters.steps_per_epoch%(24*4*int(parameters.param1)))-1 77 | 78 | # --- Instantiate environment --- 79 | env = MG_two_storages_env(rng, parameters.param1, parameters.param2, parameters.param3) 80 | 81 | # --- Instantiate qnetwork --- 82 | qnetwork = MyQNetwork( 83 | env, 84 | parameters.rms_decay, 85 | parameters.rms_epsilon, 86 | parameters.momentum, 87 | parameters.clip_norm, 88 | parameters.freeze_interval, 89 | parameters.batch_size, 90 | parameters.update_rule, 91 | rng) 92 | 93 | # --- Instantiate agent --- 94 | agent = NeuralAgent( 95 | env, 96 | qnetwork, 97 | parameters.replay_memory_size, 98 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 99 | parameters.batch_size, 100 | rng) 101 | 102 | # --- Create unique filename for FindBestController --- 103 | h = hash(vars(parameters), hash_name="sha1") 104 | fname = "MG2S_" + h 105 | print("The parameters hash is: {}".format(h)) 106 | print("The parameters are: {}".format(parameters)) 107 | 108 | # --- Bind controllers to the agent --- 109 | # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 110 | # learning rate as well as the training epoch number. 
111 | agent.attach(bc.VerboseController( 112 | evaluate_on='epoch', 113 | periodicity=1)) 114 | 115 | # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 116 | # Plus, we also want to display after each training episode (!= than after every training) the average bellman 117 | # residual and the average of the V values obtained during the last episode, hence the two last arguments. 118 | agent.attach(bc.TrainerController( 119 | evaluate_on='action', 120 | periodicity=parameters.update_frequency, 121 | show_episode_avg_V_value=True, 122 | show_avg_Bellman_residual=True)) 123 | 124 | # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 125 | # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 126 | agent.attach(bc.LearningRateController( 127 | initial_learning_rate=parameters.learning_rate, 128 | learning_rate_decay=parameters.learning_rate_decay, 129 | periodicity=1)) 130 | 131 | # Same for the discount factor. 132 | agent.attach(bc.DiscountFactorController( 133 | initial_discount_factor=parameters.discount, 134 | discount_factor_growth=parameters.discount_inc, 135 | discount_factor_max=parameters.discount_max, 136 | periodicity=1)) 137 | 138 | # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy 139 | # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more 140 | # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every 141 | # episode or epoch (or never, hence the resetEvery='none'). 142 | agent.attach(bc.EpsilonController( 143 | initial_e=parameters.epsilon_start, 144 | e_decays=parameters.epsilon_decay, 145 | e_min=parameters.epsilon_min, 146 | evaluate_on='action', 147 | periodicity=1, 148 | reset_every='none')) 149 | 150 | # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 151 | # seems to generalize the best, thus which one has the highest validation score. However we also want to keep 152 | # track of a "true generalization score", the "test score". Indeed, what if we overfit the validation score ? 153 | # To achieve these goals, one can use the FindBestController along two InterleavedTestEpochControllers, one for 154 | # each mode (validation and test). It is important that the validationID and testID are the same than the id 155 | # argument of the two InterleavedTestEpochControllers (implementing the validation mode and test mode 156 | # respectively). The FindBestController will dump on disk the validation and test scores for each and every 157 | # network, as well as the structure of the neural network having the best validation score. These dumps can then 158 | # used to plot the evolution of the validation and test scores (see below) or simply recover the resulting neural 159 | # network for your application. 160 | agent.attach(bc.FindBestController( 161 | validationID=env.VALIDATION_MODE, 162 | testID=env.TEST_MODE, 163 | unique_fname=fname)) 164 | 165 | # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 166 | # "validation epoch" between each training epoch (hence the periodicity=1). For each validation epoch, we want also 167 | # to display the sum of all rewards obtained, hence the showScore=True. 
Finally, we never want this controller to 168 | # call the summarizePerformance method of MG_two_storage_env. 169 | agent.attach(bc.InterleavedTestEpochController( 170 | id=env.VALIDATION_MODE, 171 | epoch_length=parameters.steps_per_epoch, 172 | periodicity=1, 173 | show_score=True, 174 | summarize_every=-1)) 175 | 176 | # Besides inserting a validation epoch (required if one wants to find the best neural network over all training 177 | # epochs), we also wish to interleave a "test epoch" between each training epoch. For each test epoch, we also 178 | # want to display the sum of all rewards obtained, hence the showScore=True. Finally, we want to call the 179 | # summarizePerformance method of MG_two_storage_env every [parameters.period_btw_summary_perfs] *test* epochs. 180 | agent.attach(bc.InterleavedTestEpochController( 181 | id=env.TEST_MODE, 182 | epoch_length=parameters.steps_per_test, 183 | periodicity=1, 184 | show_score=True, 185 | summarize_every=parameters.period_btw_summary_perfs)) 186 | 187 | # --- Run the experiment --- 188 | try: 189 | os.mkdir("params") 190 | except Exception: 191 | pass 192 | dump(vars(parameters), "params/" + fname + ".jldump") 193 | 194 | agent.run(parameters.epochs, parameters.steps_per_epoch) 195 | 196 | # --- Show results --- 197 | basename = "scores/" + fname 198 | scores = load(basename + "_scores.jldump") 199 | plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') 200 | plt.plot(range(1, len(scores['ts'])+1), scores['ts'], label="TS", color='r') 201 | plt.legend() 202 | plt.xlabel("Number of epochs") 203 | plt.ylabel("Score") 204 | plt.savefig(basename + "_scores.pdf") 205 | plt.show() 206 | -------------------------------------------------------------------------------- /examples/gym/mountain_car_continuous_env.py: -------------------------------------------------------------------------------- 1 | """ Mountain car environment with continuous action space. 2 | 3 | Author: Vincent Francois-Lavet 4 | """ 5 | 6 | import numpy as np 7 | import copy 8 | import math 9 | from deer.base_classes import Environment 10 | import gym 11 | 12 | class MyEnv(Environment): 13 | def __init__(self, rng): 14 | """ Initialize environment. 15 | 16 | Parameters 17 | ----------- 18 | rng : numpy random number generator 19 | """ 20 | self.env = gym.make('MountainCarContinuous-v0') 21 | self.rng=rng 22 | self._last_observation = self.reset() 23 | self.is_terminal=False 24 | self._input_dim = [(1,), (1,)] 25 | 26 | def act(self, action): 27 | """ Simulate one time step in the environment and returns the reward for the time step 28 | 29 | Parameters 30 | ----------- 31 | action : list of floats (in this case one float, because there is one action) 32 | 33 | Returns 34 | ------- 35 | reward : float 36 | """ 37 | reward=0 38 | for _ in range(10): # Increase the duration of one time step by a factor 10 39 | self._last_observation, r, self.is_terminal, info = self.env.step([action[0]]) 40 | reward+=r 41 | if(self.is_terminal==True): 42 | break 43 | 44 | if (self.mode==0): # Show the policy only at test time 45 | try: 46 | self.env.render() 47 | except: 48 | pass 49 | 50 | return reward/100. #Scale the reward so that it's 1 at maximum 51 | 52 | def reset(self, mode=0): 53 | """ Reset environment for a new episode. 
54 | 55 | Parameters 56 | ----------- 57 | Mode : int 58 | -1 corresponds to training and 0 to test 59 | """ 60 | self.mode=mode 61 | 62 | self._last_observation = self.env.reset() 63 | 64 | self.is_terminal=False 65 | 66 | return self._last_observation 67 | 68 | def inTerminalState(self): 69 | """ This returns whether the environment reached a terminal state after the last transition 70 | (i.e. whether the last transition that occurred was terminal). 71 | 72 | Returns 73 | ------- 74 | self.is_terminal : bool 75 | """ 76 | return self.is_terminal 77 | 78 | def inputDimensions(self): 79 | return self._input_dim 80 | 81 | def nActions(self): 82 | """ Provides the bounds on the action space 83 | 84 | Returns 85 | ------- 86 | bounds on the action space 87 | """ 88 | return [[self.env.action_space.low[0],self.env.action_space.high[0]]] 89 | 90 | def observe(self): 91 | return copy.deepcopy(self._last_observation) 92 | 93 | def main(): 94 | # This function can be used for debug purposes 95 | rng = np.random.RandomState(123456) 96 | myenv=MyEnv(rng) 97 | print(myenv.env.action_space) 98 | print(myenv.env.action_space.low) 99 | print(myenv.env.action_space.high) 100 | print(myenv.env.observation_space) 101 | 102 | print (myenv.observe()) 103 | myenv.act([0]) 104 | print (myenv.observe()) 105 | myenv.act([1]) 106 | print (myenv.observe()) 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /examples/gym/mountain_car_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import math 4 | from deer.base_classes import Environment 5 | import gym 6 | import sys 7 | 8 | class MyEnv(Environment): 9 | def __init__(self, rng): 10 | """ Initialize environment. 11 | 12 | Arguments: 13 | rng - the numpy random number generator 14 | """ 15 | gym.envs.register( 16 | id='MountainCarModified-v0', 17 | entry_point='gym.envs.classic_control:MountainCarEnv', 18 | max_episode_steps=500, # MountainCar-v0 uses 200 19 | reward_threshold=-110.0, 20 | ) 21 | 22 | self.env = gym.make('MountainCarModified-v0') 23 | self.env.max_episode_steps = 500 24 | self.rng=rng 25 | self._last_observation = self.env.reset() 26 | self.is_terminal=False 27 | self._input_dim = [(1,), (1,)] # self.env.observation_space.shape is equal to 2 28 | # and we use only the current observation in the pseudo-state 29 | 30 | def act(self, action): 31 | """ Simulate one time step in the environment. 32 | """ 33 | reward=0 34 | nsteps=10 35 | for _ in range(nsteps): 36 | self._last_observation, r, self.is_terminal, info = self.env.step(action) 37 | reward+=r 38 | if(self.is_terminal==True): 39 | reward+=3*nsteps 40 | break 41 | 42 | if (self.mode==0): # Show the policy only at test time 43 | try: 44 | self.env.render() 45 | except: 46 | pass 47 | #print("Warning:", sys.exc_info()[0]) 48 | 49 | #s=copy.deepcopy(self._last_observation) 50 | ## Possibility to add a reward shaping for faster convergence 51 | #s[0]+=math.pi/6 52 | #if(s[0]>0): 53 | # reward+=pow(s[0],2)#np.linalg.norm(s[0]) 54 | 55 | return reward/nsteps 56 | 57 | def reset(self, mode=0): 58 | """ Reset environment for a new episode. 
59 | 60 | Arguments: 61 | Mode : int 62 | -1 corresponds to training and 0 to test 63 | """ 64 | self.mode=mode 65 | 66 | self._last_observation = self.env.reset() 67 | # DEEPRECATED 68 | #if (self.mode==-1): # Reset to a random value when in training mode (that allows to increase exploration) 69 | # high=self.env.observation_space.high 70 | # low=self.env.observation_space.low 71 | # self._last_observation=low+self.rng.rand(2)*(high-low) 72 | # self.env.env.state=self._last_observation 73 | 74 | self.is_terminal=False 75 | 76 | 77 | return self._last_observation 78 | 79 | def inTerminalState(self): 80 | """ Tell whether the environment reached a terminal state after the last transition (i.e. the last transition 81 | that occured was terminal). 82 | """ 83 | return self.is_terminal 84 | 85 | def inputDimensions(self): 86 | return self._input_dim 87 | 88 | def nActions(self): 89 | return 3 #Would be useful to have this directly in gym : self.env.action_space.shape 90 | 91 | def observe(self): 92 | return copy.deepcopy(self._last_observation) 93 | 94 | def main(): 95 | # This function can be used for debug purposes 96 | rng = np.random.RandomState(123456) 97 | myenv=MyEnv(rng) 98 | 99 | print (myenv.observe()) 100 | 101 | if __name__ == "__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /examples/gym/pendulum_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | from deer.base_classes import Environment 5 | import gym 6 | 7 | class MyEnv(Environment): 8 | def __init__(self, rng): 9 | """ Initialize environment. 10 | 11 | Arguments: 12 | rng - the numpy random number generator 13 | """ 14 | # Defining the type of environment 15 | self.env = gym.make('CartPole-v0') 16 | self._last_observation = self.env.reset() 17 | self.is_terminal=False 18 | self._input_dim = [(1,), (1,), (1,), (1,)] # self.env.observation_space.shape is equal to 4 19 | # and we use only the current observations in the pseudo-state 20 | 21 | def act(self, action): 22 | """ Simulate one time step in the environment. 23 | """ 24 | 25 | self._last_observation, reward, self.is_terminal, info = self.env.step(action) 26 | if (self.mode==0): # Show the policy only at test time 27 | self.env.render() 28 | 29 | return reward 30 | 31 | def reset(self, mode=0): 32 | """ Reset environment for a new episode. 33 | 34 | Arguments: 35 | Mode : int 36 | -1 corresponds to training and 0 to test 37 | """ 38 | # Reset initial observation to a random x and theta 39 | self._last_observation = self.env.reset() 40 | self.is_terminal=False 41 | self.mode=mode 42 | 43 | return self._last_observation 44 | 45 | def inTerminalState(self): 46 | """Tell whether the environment reached a terminal state after the last transition (i.e. the last transition 47 | that occured was terminal). 
48 | """ 49 | return self.is_terminal 50 | 51 | def inputDimensions(self): 52 | return self._input_dim 53 | 54 | def nActions(self): 55 | return 2 #Would be useful to have this directly in gym : self.env.action_space.shape 56 | 57 | def observe(self): 58 | return copy.deepcopy(self._last_observation) 59 | 60 | def main(): 61 | rng = np.random.RandomState(123456) 62 | myenv=MyEnv(rng) 63 | 64 | print (myenv.observe()) 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /examples/gym/run_mountain_car.py: -------------------------------------------------------------------------------- 1 | """ Mountain car environment launcher. 2 | Same principles as run_toy_env. See the docs for more details. 3 | 4 | """ 5 | 6 | import sys 7 | import logging 8 | import numpy as np 9 | 10 | import deer.experiment.base_controllers as bc 11 | from deer.default_parser import process_args 12 | from deer.agent import NeuralAgent 13 | from deer.learning_algos.q_net_keras import MyQNetwork 14 | from mountain_car_env import MyEnv as mountain_car_env 15 | from deer.policies import EpsilonGreedyPolicy,LongerExplorationPolicy 16 | 17 | class Defaults: 18 | # ---------------------- 19 | # Experiment Parameters 20 | # ---------------------- 21 | STEPS_PER_EPOCH = 200 22 | EPOCHS = 200 23 | STEPS_PER_TEST = 200 24 | PERIOD_BTW_SUMMARY_PERFS = 10 25 | 26 | # ---------------------- 27 | # Environment Parameters 28 | # ---------------------- 29 | FRAME_SKIP = 1 30 | 31 | # ---------------------- 32 | # DQN Agent parameters: 33 | # ---------------------- 34 | UPDATE_RULE = 'rmsprop' 35 | LEARNING_RATE = 0.005 36 | LEARNING_RATE_DECAY = 0.99 37 | DISCOUNT = 0.9 38 | DISCOUNT_INC = 0.99 39 | DISCOUNT_MAX = 0.95 40 | RMS_DECAY = 0.9 41 | RMS_EPSILON = 0.0001 42 | MOMENTUM = 0 43 | CLIP_NORM = 1.0 44 | EPSILON_START = 1.0 45 | EPSILON_MIN = 0.2 46 | EPSILON_DECAY = 10000 47 | UPDATE_FREQUENCY = 1 48 | REPLAY_MEMORY_SIZE = 1000000 49 | BATCH_SIZE = 32 50 | FREEZE_INTERVAL = 100 51 | DETERMINISTIC = True 52 | 53 | if __name__ == "__main__": 54 | logging.basicConfig(level=logging.INFO) 55 | 56 | # --- Parse parameters --- 57 | parameters = process_args(sys.argv[1:], Defaults) 58 | if parameters.deterministic: 59 | rng = np.random.RandomState(12345) 60 | else: 61 | rng = np.random.RandomState() 62 | 63 | # --- Instantiate environment --- 64 | env = mountain_car_env(rng) 65 | 66 | # --- Instantiate qnetwork --- 67 | qnetwork = MyQNetwork( 68 | env, 69 | parameters.rms_decay, 70 | parameters.rms_epsilon, 71 | parameters.momentum, 72 | parameters.clip_norm, 73 | parameters.freeze_interval, 74 | parameters.batch_size, 75 | parameters.update_rule, 76 | rng, 77 | double_Q=True) 78 | 79 | train_policy = LongerExplorationPolicy(qnetwork, env.nActions(), rng, 1.0)#EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.) 80 | test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.) 
81 | 82 | # --- Instantiate agent --- 83 | agent = NeuralAgent( 84 | env, 85 | qnetwork, 86 | parameters.replay_memory_size, 87 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 88 | parameters.batch_size, 89 | rng, 90 | exp_priority=1., 91 | train_policy=train_policy, 92 | test_policy=test_policy) 93 | 94 | # --- Bind controllers to the agent --- 95 | # For comments, please refer to run_toy_env.py 96 | agent.attach(bc.VerboseController( 97 | evaluate_on='epoch', 98 | periodicity=1)) 99 | 100 | agent.attach(bc.TrainerController( 101 | evaluate_on='action', 102 | periodicity=parameters.update_frequency, 103 | show_episode_avg_V_value=True, 104 | show_avg_Bellman_residual=True)) 105 | 106 | agent.attach(bc.LearningRateController( 107 | initial_learning_rate=parameters.learning_rate, 108 | learning_rate_decay=parameters.learning_rate_decay, 109 | periodicity=1)) 110 | 111 | agent.attach(bc.DiscountFactorController( 112 | initial_discount_factor=parameters.discount, 113 | discount_factor_growth=parameters.discount_inc, 114 | discount_factor_max=parameters.discount_max, 115 | periodicity=1)) 116 | 117 | agent.attach(bc.EpsilonController( 118 | initial_e=parameters.epsilon_start, 119 | e_decays=parameters.epsilon_decay, 120 | e_min=parameters.epsilon_min, 121 | evaluate_on='action', 122 | periodicity=1, 123 | reset_every='none')) 124 | 125 | agent.attach(bc.InterleavedTestEpochController( 126 | id=0, 127 | epoch_length=parameters.steps_per_test, 128 | periodicity=1, 129 | show_score=True, 130 | summarize_every=parameters.period_btw_summary_perfs)) 131 | 132 | # --- Run the experiment --- 133 | agent.run(parameters.epochs, parameters.steps_per_epoch) 134 | -------------------------------------------------------------------------------- /examples/gym/run_mountain_car_continuous.py: -------------------------------------------------------------------------------- 1 | """ Launcher for mountain car environment with continuous action space. 2 | Same principles as run_toy_env. See the wiki for more details. 
3 | 4 | """ 5 | 6 | import sys 7 | import logging 8 | import numpy as np 9 | 10 | import deer.experiment.base_controllers as bc 11 | from deer.default_parser import process_args 12 | from deer.agent import NeuralAgent 13 | from deer.learning_algos.AC_net_keras import MyACNetwork 14 | from mountain_car_continuous_env import MyEnv as mountain_car_continuous_env 15 | from deer.policies import LongerExplorationPolicy 16 | 17 | 18 | class Defaults: 19 | # ---------------------- 20 | # Experiment Parameters 21 | # ---------------------- 22 | STEPS_PER_EPOCH = 200 23 | EPOCHS = 200 24 | STEPS_PER_TEST = 200 25 | PERIOD_BTW_SUMMARY_PERFS = 10 26 | 27 | # ---------------------- 28 | # Environment Parameters 29 | # ---------------------- 30 | FRAME_SKIP = 1 31 | 32 | # ---------------------- 33 | # DQN Agent parameters: 34 | # ---------------------- 35 | UPDATE_RULE = 'rmsprop' 36 | LEARNING_RATE = 0.002 37 | LEARNING_RATE_DECAY = 0.99 38 | DISCOUNT = 0.9 39 | DISCOUNT_INC = 0.99 40 | DISCOUNT_MAX = 0.95 41 | RMS_DECAY = 0.9 42 | RMS_EPSILON = 0.0001 43 | MOMENTUM = 0 44 | CLIP_NORM = 1.0 45 | EPSILON_START = 1.0 46 | EPSILON_MIN = 0.2 47 | EPSILON_DECAY = 10000 48 | UPDATE_FREQUENCY = 1 49 | REPLAY_MEMORY_SIZE = 1000000 50 | BATCH_SIZE = 32 51 | FREEZE_INTERVAL = 100 52 | DETERMINISTIC = True 53 | 54 | if __name__ == "__main__": 55 | logging.basicConfig(level=logging.INFO) 56 | 57 | # --- Parse parameters --- 58 | parameters = process_args(sys.argv[1:], Defaults) 59 | if parameters.deterministic: 60 | rng = np.random.RandomState(12345) 61 | else: 62 | rng = np.random.RandomState() 63 | 64 | # --- Instantiate environment --- 65 | env = mountain_car_continuous_env(rng) 66 | 67 | # --- Instantiate qnetwork --- 68 | qnetwork = MyACNetwork( 69 | env, 70 | parameters.rms_decay, 71 | parameters.rms_epsilon, 72 | parameters.momentum, 73 | parameters.clip_norm, 74 | parameters.freeze_interval, 75 | parameters.batch_size, 76 | parameters.update_rule, 77 | rng) 78 | 79 | train_policy=LongerExplorationPolicy(qnetwork, env.nActions(), rng, 1.,10) 80 | 81 | # --- Instantiate agent --- 82 | agent = NeuralAgent( 83 | env, 84 | qnetwork, 85 | parameters.replay_memory_size, 86 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 87 | parameters.batch_size, 88 | rng, 89 | exp_priority=1., 90 | train_policy=train_policy) 91 | 92 | # --- Bind controllers to the agent --- 93 | # For comments, please refer to run_toy_env.py 94 | agent.attach(bc.VerboseController( 95 | evaluate_on='epoch', 96 | periodicity=1)) 97 | 98 | agent.attach(bc.TrainerController( 99 | evaluate_on='action', 100 | periodicity=parameters.update_frequency, 101 | show_episode_avg_V_value=True, 102 | show_avg_Bellman_residual=True)) 103 | 104 | agent.attach(bc.LearningRateController( 105 | initial_learning_rate=parameters.learning_rate, 106 | learning_rate_decay=parameters.learning_rate_decay, 107 | periodicity=1)) 108 | 109 | agent.attach(bc.DiscountFactorController( 110 | initial_discount_factor=parameters.discount, 111 | discount_factor_growth=parameters.discount_inc, 112 | discount_factor_max=parameters.discount_max, 113 | periodicity=1)) 114 | 115 | agent.attach(bc.EpsilonController( 116 | initial_e=parameters.epsilon_start, 117 | e_decays=parameters.epsilon_decay, 118 | e_min=parameters.epsilon_min, 119 | evaluate_on='action', 120 | periodicity=1, 121 | reset_every='none')) 122 | 123 | agent.attach(bc.InterleavedTestEpochController( 124 | id=0, 125 | epoch_length=parameters.steps_per_test, 126 | periodicity=1, 127 | 
show_score=True, 128 | summarize_every=parameters.period_btw_summary_perfs)) 129 | 130 | # --- Run the experiment --- 131 | agent.run(parameters.epochs, parameters.steps_per_epoch) 132 | -------------------------------------------------------------------------------- /examples/gym/run_pendulum.py: -------------------------------------------------------------------------------- 1 | """ Pendulum environment launcher. 2 | Same principles as run_toy_env. See the docs for more details. 3 | 4 | Authors: Vincent Francois-Lavet, David Taralla 5 | """ 6 | 7 | import sys 8 | import logging 9 | import numpy as np 10 | 11 | import deer.experiment.base_controllers as bc 12 | from deer.default_parser import process_args 13 | from deer.agent import NeuralAgent 14 | from deer.learning_algos.q_net_keras import MyQNetwork 15 | from pendulum_env import MyEnv as pendulum_env 16 | 17 | class Defaults: 18 | # ---------------------- 19 | # Experiment Parameters 20 | # ---------------------- 21 | STEPS_PER_EPOCH = 100 22 | EPOCHS = 200 23 | STEPS_PER_TEST = 100 24 | PERIOD_BTW_SUMMARY_PERFS = 10 25 | 26 | # ---------------------- 27 | # Environment Parameters 28 | # ---------------------- 29 | FRAME_SKIP = 1 30 | 31 | # ---------------------- 32 | # DQN Agent parameters: 33 | # ---------------------- 34 | UPDATE_RULE = 'rmsprop' 35 | LEARNING_RATE = 0.0002 36 | LEARNING_RATE_DECAY = 0.99 37 | DISCOUNT = 0.9 38 | DISCOUNT_INC = 1. 39 | DISCOUNT_MAX = 0.95 40 | RMS_DECAY = 0.9 41 | RMS_EPSILON = 0.0001 42 | MOMENTUM = 0 43 | CLIP_NORM = 1.0 44 | EPSILON_START = 1.0 45 | EPSILON_MIN = 0.2 46 | EPSILON_DECAY = 10000 47 | UPDATE_FREQUENCY = 1 48 | REPLAY_MEMORY_SIZE = 1000000 49 | BATCH_SIZE = 32 50 | FREEZE_INTERVAL = 500 51 | DETERMINISTIC = True 52 | 53 | if __name__ == "__main__": 54 | logging.basicConfig(level=logging.INFO) 55 | 56 | # --- Parse parameters --- 57 | parameters = process_args(sys.argv[1:], Defaults) 58 | if parameters.deterministic: 59 | rng = np.random.RandomState(12345) 60 | else: 61 | rng = np.random.RandomState() 62 | 63 | # --- Instantiate environment --- 64 | env = pendulum_env(rng) 65 | 66 | # --- Instantiate qnetwork --- 67 | qnetwork = MyQNetwork( 68 | env, 69 | parameters.rms_decay, 70 | parameters.rms_epsilon, 71 | parameters.momentum, 72 | parameters.clip_norm, 73 | parameters.freeze_interval, 74 | parameters.batch_size, 75 | parameters.update_rule, 76 | rng, 77 | double_Q=True) 78 | 79 | # --- Instantiate agent --- 80 | agent = NeuralAgent( 81 | env, 82 | qnetwork, 83 | parameters.replay_memory_size, 84 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 85 | parameters.batch_size, 86 | rng) 87 | 88 | # --- Bind controllers to the agent --- 89 | # For comments, please refer to run_toy_env.py 90 | agent.attach(bc.VerboseController( 91 | evaluate_on='epoch', 92 | periodicity=1)) 93 | 94 | agent.attach(bc.TrainerController( 95 | evaluate_on='action', 96 | periodicity=parameters.update_frequency, 97 | show_episode_avg_V_value=False, 98 | show_avg_Bellman_residual=False)) 99 | 100 | agent.attach(bc.LearningRateController( 101 | initial_learning_rate=parameters.learning_rate, 102 | learning_rate_decay=parameters.learning_rate_decay, 103 | periodicity=1)) 104 | 105 | agent.attach(bc.DiscountFactorController( 106 | initial_discount_factor=parameters.discount, 107 | discount_factor_growth=parameters.discount_inc, 108 | discount_factor_max=parameters.discount_max, 109 | periodicity=1)) 110 | 111 | agent.attach(bc.EpsilonController( 112 | 
initial_e=parameters.epsilon_start, 113 | e_decays=parameters.epsilon_decay, 114 | e_min=parameters.epsilon_min, 115 | evaluate_on='action', 116 | periodicity=1, 117 | reset_every='none')) 118 | 119 | agent.attach(bc.InterleavedTestEpochController( 120 | id=0, 121 | epoch_length=parameters.steps_per_test, 122 | periodicity=1, 123 | show_score=True, 124 | summarize_every=parameters.period_btw_summary_perfs)) 125 | 126 | # --- Run the experiment --- 127 | agent.run(parameters.epochs, parameters.steps_per_epoch) 128 | -------------------------------------------------------------------------------- /examples/maze/a_star_path_finding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code from https://github.com/laurentluce/python-algorithms/blob/master/algorithms/a_star_path_finding.py 3 | """ 4 | 5 | import heapq 6 | import numpy as np 7 | 8 | class Cell(object): 9 | def __init__(self, x, y, reachable): 10 | """Initialize new cell. 11 | 12 | @param reachable is cell reachable? not a wall? 13 | @param x cell x coordinate 14 | @param y cell y coordinate 15 | @param g cost to move from the starting cell to this cell. 16 | @param h estimation of the cost to move from this cell 17 | to the ending cell. 18 | @param f f = g + h 19 | """ 20 | self.reachable = reachable 21 | self.x = x 22 | self.y = y 23 | self.parent = None 24 | self.g = 0 25 | self.h = 0 26 | self.f = 0 27 | 28 | def __lt__(self, other): 29 | return self.f < other.f 30 | 31 | #class Cell(object): 32 | # def __init__(self, x, y, reachable): 33 | # """Initialize new cell. 34 | # @param reachable is cell reachable? not a wall? 35 | # @param x cell x coordinate 36 | # @param y cell y coordinate 37 | # @param g cost to move from the starting cell to this cell. 38 | # @param h estimation of the cost to move from this cell 39 | # to the ending cell. 40 | # @param f f = g + h 41 | # """ 42 | # self.reachable = reachable 43 | ## self.occupied = False 44 | # self.x = x 45 | # self.y = y 46 | # self.parent = None 47 | # self.g = 0 48 | # self.h = 0 49 | # self.f = 0 50 | 51 | 52 | class AStar(object): 53 | def __init__(self): 54 | # open list 55 | self.opened = [] 56 | heapq.heapify(self.opened) 57 | # visited cells list 58 | self.closed = set() 59 | # grid cells 60 | self.cells = [] 61 | self.grid_height = None 62 | self.grid_width = None 63 | 64 | def init_grid(self, width, height, walls, start, end): 65 | """Prepare grid cells, walls. 66 | @param width grid's width. 67 | @param height grid's height. 68 | @param walls list of wall x,y tuples. 69 | @param start grid starting point x,y tuple. 70 | @param end grid ending point x,y tuple. 71 | """ 72 | self.grid_height = height 73 | self.grid_width = width 74 | for x in range(self.grid_width): 75 | for y in range(self.grid_height): 76 | if (x, y) in walls: 77 | reachable = False 78 | else: 79 | reachable = True 80 | self.cells.append(Cell(x, y, reachable)) 81 | self.start = self.get_cell(*start) 82 | self.start.reachable=True 83 | 84 | self.agent_cell=self.start 85 | self.end = self.get_cell(*end) 86 | 87 | def get_heuristic(self, cell): 88 | """Compute the heuristic value H for a cell. 89 | Distance between this cell and the ending cell multiply by 10. 90 | @returns heuristic value H 91 | """ 92 | return 10 * (abs(cell.x - self.end.x) + abs(cell.y - self.end.y)) 93 | 94 | def get_cell(self, x, y): 95 | """Returns a cell from the cells list. 
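        Cells are stored in a flat list built in init_grid, hence the
        x * grid_height + y indexing below.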
96 | @param x cell x coordinate 97 | @param y cell y coordinate 98 | @returns cell 99 | """ 100 | return self.cells[x * self.grid_height + y] 101 | 102 | def get_adjacent_cells(self, cell): 103 | """Returns adjacent cells to a cell. 104 | Clockwise starting from the one on the right. 105 | @param cell get adjacent cells for this cell 106 | @returns adjacent cells list. 107 | """ 108 | cells = [] 109 | if cell.x < self.grid_width-1: 110 | cells.append(self.get_cell(cell.x+1, cell.y)) 111 | if cell.y > 0: 112 | cells.append(self.get_cell(cell.x, cell.y-1)) 113 | if cell.x > 0: 114 | cells.append(self.get_cell(cell.x-1, cell.y)) 115 | if cell.y < self.grid_height-1: 116 | cells.append(self.get_cell(cell.x, cell.y+1)) 117 | return cells 118 | 119 | def get_path(self): 120 | cell = self.end 121 | path = [(cell.x, cell.y)] 122 | while cell.parent is not self.start: 123 | cell = cell.parent 124 | path.append((cell.x, cell.y)) 125 | 126 | path.append((self.start.x, self.start.y)) 127 | path.reverse() 128 | return path 129 | 130 | def update_cell(self, adj, cell): 131 | """Update adjacent cell. 132 | @param adj adjacent cell to current cell 133 | @param cell current cell being processed 134 | """ 135 | adj.g = cell.g + 10 136 | adj.h = self.get_heuristic(adj) 137 | adj.parent = cell 138 | adj.f = adj.h + adj.g 139 | 140 | def solve(self): 141 | """Solve maze, find path to ending cell. 142 | @returns path or None if not found. 143 | """ 144 | # add starting cell to open heap queue 145 | heapq.heappush(self.opened, (self.start.f, self.start)) 146 | while len(self.opened): 147 | # pop cell from heap queue 148 | f, cell = heapq.heappop(self.opened) 149 | # add cell to closed list so we don't process it twice 150 | self.closed.add(cell) 151 | # if ending cell, return found path 152 | if cell is self.end: 153 | return self.get_path() 154 | # get adjacent cells for cell 155 | adj_cells = self.get_adjacent_cells(cell) 156 | for adj_cell in adj_cells: 157 | if adj_cell.reachable and adj_cell not in self.closed: 158 | if (adj_cell.f, adj_cell) in self.opened: 159 | # if adj cell in open list, check if current path is 160 | # better than the one previously found 161 | # for this adj cell. 
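                        # (cell.g + 10 is the cost of reaching adj_cell through the current
                        # cell, since update_cell uses a uniform move cost of 10 per step.)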
162 | if adj_cell.g > cell.g + 10: 163 | self.update_cell(adj_cell, cell) 164 | else: 165 | self.update_cell(adj_cell, cell) 166 | # add adj cell to open list 167 | heapq.heappush(self.opened, (adj_cell.f, adj_cell)) 168 | 169 | def get_maze_array(self): 170 | maze=[] 171 | for i in range(self.grid_height): 172 | row=[] 173 | for j in range(self.grid_width): 174 | if(self.get_cell(i, j) is self.agent_cell): 175 | row.append(1) 176 | elif(self.get_cell(i, j) is self.end): 177 | row.append(2) 178 | elif(self.get_cell(i, j).reachable==True): 179 | row.append(0) 180 | else: 181 | row.append(-1) 182 | maze.append(row) 183 | 184 | return maze -------------------------------------------------------------------------------- /examples/maze/maze_env.py: -------------------------------------------------------------------------------- 1 | """ Environment with a distribution of mazes (one new maze is drawn at each episode) 2 | 3 | Author: Vincent Francois-Lavet 4 | """ 5 | import numpy as np 6 | 7 | from deer.base_classes import Environment 8 | 9 | #import matplotlib 10 | #matplotlib.use('qt5agg') 11 | #from mpl_toolkits.axes_grid1 import host_subplot 12 | #import mpl_toolkits.axisartist as AA 13 | #import matplotlib.pyplot as plt 14 | import copy 15 | import a_star_path_finding as pf 16 | 17 | class MyEnv(Environment): 18 | VALIDATION_MODE = 0 19 | 20 | def __init__(self, rng, **kwargs): 21 | 22 | self._random_state = rng 23 | self._mode = -1 24 | self._mode_score = 0.0 25 | self._mode_episode_count = 0 26 | self._episode_steps = 0 27 | self._actions = [0,1,2,3] 28 | self._size_maze = 8 29 | self._higher_dim_obs=kwargs.get('higher_dim_obs',False) 30 | self._reverse=kwargs.get('reverse',False) 31 | 32 | self._n_walls = int((self._size_maze-2)**2/3.)#int((self._size_maze)**2/3.) 
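# About one third of the (size_maze-2)**2 interior cells will be turned into randomly placed walls by create_map below.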
33 | self._n_rewards = 3 34 | self.create_map() 35 | self.intern_dim=3 36 | 37 | def create_map(self): 38 | valid_map=False 39 | while valid_map==False: 40 | # Agent 41 | self._pos_agent=[1,1] 42 | 43 | # Walls 44 | self._pos_walls=[] 45 | for i in range(self._size_maze): 46 | self._pos_walls.append([i,0]) 47 | self._pos_walls.append([i,self._size_maze-1]) 48 | for j in range(self._size_maze-2): 49 | self._pos_walls.append([0,j+1]) 50 | self._pos_walls.append([self._size_maze-1,j+1]) 51 | 52 | n=0 53 | while n < self._n_walls: 54 | potential_wall=[self._random_state.randint(1,self._size_maze-2),self._random_state.randint(1,self._size_maze-2)] 55 | if(potential_wall not in self._pos_walls and potential_wall!=self._pos_agent): 56 | self._pos_walls.append(potential_wall) 57 | n+=1 58 | 59 | # Rewards 60 | #self._pos_rewards=[[self._size_maze-2,self._size_maze-2]] 61 | self._pos_rewards=[] 62 | n=0 63 | while n < self._n_rewards: 64 | potential_reward=[self._random_state.randint(1,self._size_maze-1),self._random_state.randint(1,self._size_maze-1)] 65 | if(potential_reward not in self._pos_rewards and potential_reward not in self._pos_walls and potential_reward!=self._pos_agent): 66 | self._pos_rewards.append(potential_reward) 67 | n+=1 68 | 69 | valid_map=self.is_valid_map(self._pos_agent,self._pos_walls,self._pos_rewards) 70 | 71 | 72 | def is_valid_map(self,pos_agent,pos_walls,pos_rewards): 73 | a = pf.AStar() 74 | pos_walls 75 | walls = [tuple(w) for w in pos_walls] 76 | start=tuple(pos_agent) 77 | for r in pos_rewards: 78 | end=tuple(r) 79 | a.init_grid(self._size_maze, self._size_maze, walls, start, end) 80 | maze=a 81 | optimal_path=maze.solve() 82 | if(optimal_path==None): 83 | return False 84 | 85 | return True 86 | 87 | def reset(self, mode): 88 | self._episode_steps = 0 89 | self._mode=mode 90 | self.create_map() 91 | 92 | if mode == MyEnv.VALIDATION_MODE: 93 | if self._mode != MyEnv.VALIDATION_MODE: 94 | self._mode = MyEnv.VALIDATION_MODE 95 | self._mode_score = 0.0 96 | self._mode_episode_count = 0 97 | 98 | else: 99 | self._mode_episode_count += 1 100 | 101 | return [1 * [self._size_maze * [self._size_maze * [0]]]] 102 | 103 | 104 | def act(self, action): 105 | self._episode_steps += 1 106 | action = self._actions[action] 107 | 108 | reward = -0.1 109 | 110 | if(action==0): 111 | if([self._pos_agent[0]+1,self._pos_agent[1]] not in self._pos_walls): 112 | self._pos_agent[0]=self._pos_agent[0]+1 113 | elif(action==1): 114 | if([self._pos_agent[0],self._pos_agent[1]+1] not in self._pos_walls): 115 | self._pos_agent[1]=self._pos_agent[1]+1 116 | elif(action==2): 117 | if([self._pos_agent[0]-1,self._pos_agent[1]] not in self._pos_walls): 118 | self._pos_agent[0]=self._pos_agent[0]-1 119 | elif(action==3): 120 | if([self._pos_agent[0],self._pos_agent[1]-1] not in self._pos_walls): 121 | self._pos_agent[1]=self._pos_agent[1]-1 122 | 123 | if (self._pos_agent in self._pos_rewards): 124 | reward = 1 125 | self._pos_rewards.remove(self._pos_agent) 126 | 127 | self._mode_score += reward 128 | return reward 129 | 130 | 131 | def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): 132 | print ("test_data_set.observations.shape") 133 | print (test_data_set.observations()[0][0:1]) 134 | 135 | print ("self._mode_score:"+str(self._mode_score)+".") 136 | 137 | 138 | def inputDimensions(self): 139 | if(self._higher_dim_obs==True): 140 | return [(1,self._size_maze*6,self._size_maze*6)] 141 | else: 142 | return [(1,self._size_maze,self._size_maze)] 143 | 144 | def 
observationType(self, subject): 145 | return np.float32 146 | 147 | def nActions(self): 148 | return len(self._actions) 149 | 150 | def observe(self): 151 | self._map=np.zeros((self._size_maze,self._size_maze)) 152 | for coord_wall in self._pos_walls: 153 | self._map[coord_wall[0],coord_wall[1]]=1 154 | for coord_reward in self._pos_rewards: 155 | self._map[coord_reward[0],coord_reward[1]]=2 156 | self._map[self._pos_agent[0],self._pos_agent[1]]=0.5 157 | 158 | if(self._higher_dim_obs==True): 159 | indices_reward=np.argwhere(self._map == 2) 160 | indices_agent=np.argwhere(self._map == 0.5) 161 | self._map=self._map/1. 162 | self._map=np.repeat(np.repeat(self._map, 6, axis=0),6, axis=1) 163 | # agent repr 164 | agent_obs=np.zeros((6,6)) 165 | agent_obs[0,2]=0.8 166 | agent_obs[1,0:5]=0.9 167 | agent_obs[2,1:4]=0.9 168 | agent_obs[3,1:4]=0.9 169 | agent_obs[4,1]=0.9 170 | agent_obs[4,3]=0.9 171 | agent_obs[5,0:2]=0.9 172 | agent_obs[5,3:5]=0.9 173 | 174 | # reward repr 175 | reward_obs=np.zeros((6,6)) 176 | reward_obs[:,1]=0.7 177 | reward_obs[0,1:4]=0.6 178 | reward_obs[1,3]=0.7 179 | reward_obs[2,1:4]=0.6 180 | reward_obs[4,2]=0.7 181 | reward_obs[5,2:4]=0.7 182 | 183 | for i in indices_reward: 184 | self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=reward_obs 185 | 186 | for i in indices_agent: 187 | self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=agent_obs 188 | self._map=(self._map*2)-1 #scaling 189 | #print ("self._map higher_dim_obs") 190 | #print (self._map) 191 | #plt.imshow(self._map, cmap='gray_r') 192 | #plt.show() 193 | else: 194 | self._map=self._map/2. 195 | self._map[self._map == 0.5] = 0.99 # agent 196 | self._map[self._map == 1.] = 0.5 # reward 197 | 198 | if(self._reverse==True): 199 | self._map=-self._map #1-self._map 200 | 201 | return [self._map] 202 | 203 | def inTerminalState(self): 204 | if ( self._pos_rewards==[] or (self._mode>=0 and self._episode_steps >= 50) ): 205 | return True 206 | else: 207 | return False 208 | 209 | 210 | 211 | if __name__ == "__main__": 212 | import hashlib 213 | 214 | rng = np.random.RandomState(123456) 215 | env = MyEnv(rng, higher_dim_obs=False) 216 | 217 | maps=[] 218 | for i in range(10000): 219 | env.create_map() 220 | 221 | one_laby=env.observe()[0] 222 | 223 | # Hashing the labyrinths to be able to find duplicates in O(1) 224 | one_laby=int(hashlib.sha1(str(one_laby).encode('utf-8')).hexdigest(), 16) % (10 ** 8) 225 | 226 | # TESTING ADDING DUPLICATION 227 | if i%1000==0: 228 | env.reset(0) 229 | if i%1000==500: 230 | env.reset(1) 231 | 232 | maps.append(copy.deepcopy(one_laby)) 233 | 234 | duplicate_laby=0 235 | for i in range(10000): 236 | env.create_map() 237 | one_laby=env.observe()[0] 238 | 239 | # Hashing the labyrinths to be able to find duplicates in O(1) 240 | one_laby=int(hashlib.sha1(str(one_laby).encode('utf-8')).hexdigest(), 16) % (10 ** 8) 241 | 242 | # TESTING ADDING DUPLICATION 243 | #if i%1000==0: 244 | # maps.append(one_laby) 245 | 246 | # TESTING WITH RESETS 247 | if i%1000==0: 248 | env.reset(0) 249 | if i%1000==500: 250 | env.reset(1) 251 | 252 | duplicate=min(maps.count(one_laby),1) 253 | duplicate_laby+=duplicate 254 | 255 | if i%1000==0: 256 | print ("Number of duplicate labyrinths:"+str(duplicate_laby)+".") 257 | 258 | 259 | 260 | -------------------------------------------------------------------------------- /examples/test_CRAR/run_simple_maze.py: -------------------------------------------------------------------------------- 1 | """Simple maze launcher 2 | 3 | """ 4 | 5 | import sys 6 | import logging 7 | 
import numpy as np 8 | from joblib import hash, dump, load 9 | import os 10 | 11 | from deer.default_parser import process_args 12 | from deer.agent import NeuralAgent 13 | from deer.learning_algos.CRAR_keras import CRAR 14 | from simple_maze_env import MyEnv as simple_maze_env 15 | import deer.experiment.base_controllers as bc 16 | 17 | from deer.policies import EpsilonGreedyPolicy 18 | 19 | 20 | class Defaults: 21 | # ---------------------- 22 | # Experiment Parameters 23 | # ---------------------- 24 | STEPS_PER_EPOCH = 5000 25 | EPOCHS = 50 26 | STEPS_PER_TEST = 1000 27 | PERIOD_BTW_SUMMARY_PERFS = 1 28 | 29 | # ---------------------- 30 | # Environment Parameters 31 | # ---------------------- 32 | FRAME_SKIP = 2 33 | 34 | # ---------------------- 35 | # DQN Agent parameters: 36 | # ---------------------- 37 | UPDATE_RULE = 'rmsprop' 38 | LEARNING_RATE = 0.0005 39 | LEARNING_RATE_DECAY = 0.9 40 | DISCOUNT = 0.9 41 | DISCOUNT_INC = 1 42 | DISCOUNT_MAX = 0.99 43 | RMS_DECAY = 0.9 44 | RMS_EPSILON = 0.0001 45 | MOMENTUM = 0 46 | CLIP_NORM = 1.0 47 | EPSILON_START = 1.0 48 | EPSILON_MIN = 1.0 49 | EPSILON_DECAY = 10000 50 | UPDATE_FREQUENCY = 1 51 | REPLAY_MEMORY_SIZE = 1000000 #replacing with 200000 will works just fine (in case you dont have 18gb of memory) 52 | BATCH_SIZE = 32 53 | FREEZE_INTERVAL = 1000 54 | DETERMINISTIC = False 55 | 56 | 57 | HIGHER_DIM_OBS = True 58 | 59 | if __name__ == "__main__": 60 | logging.basicConfig(level=logging.INFO) 61 | 62 | # --- Parse parameters --- 63 | parameters = process_args(sys.argv[1:], Defaults) 64 | if parameters.deterministic: 65 | rng = np.random.RandomState(123456) 66 | else: 67 | rng = np.random.RandomState() 68 | 69 | # --- Instantiate environment --- 70 | env = simple_maze_env(rng, higher_dim_obs=HIGHER_DIM_OBS) 71 | 72 | # --- Instantiate learning_algo --- 73 | learning_algo = CRAR( 74 | env, 75 | parameters.rms_decay, 76 | parameters.rms_epsilon, 77 | parameters.momentum, 78 | parameters.clip_norm, 79 | parameters.freeze_interval, 80 | parameters.batch_size, 81 | parameters.update_rule, 82 | rng, 83 | high_int_dim=False, 84 | internal_dim=2) 85 | 86 | test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.) 87 | 88 | # --- Instantiate agent --- 89 | agent = NeuralAgent( 90 | env, 91 | learning_algo, 92 | parameters.replay_memory_size, 93 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 94 | parameters.batch_size, 95 | rng, 96 | test_policy=test_policy) 97 | 98 | # --- Create unique filename for FindBestController --- 99 | h = hash(vars(parameters), hash_name="sha1") 100 | fname = "test_" + h 101 | print("The parameters hash is: {}".format(h)) 102 | print("The parameters are: {}".format(parameters)) 103 | 104 | # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy 105 | # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more 106 | # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every 107 | # episode or epoch (or never, hence the resetEvery='none'). 
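# Note that EPSILON_START and EPSILON_MIN are both 1.0 in the Defaults above, so epsilon stays at 1 and the agent acts fully at random; the short agent.run(10, 500) call below therefore only serves to gather data and fill the replay memory before the actual training run (hence the "end gathering data" print and the later agent.gathering_data=False).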
108 | agent.attach(bc.EpsilonController( 109 | initial_e=parameters.epsilon_start, 110 | e_decays=parameters.epsilon_decay, 111 | e_min=parameters.epsilon_min, 112 | evaluate_on='action', 113 | periodicity=1, 114 | reset_every='none')) 115 | 116 | agent.run(10, 500) 117 | print("end gathering data") 118 | 119 | # --- Bind controllers to the agent --- 120 | # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 121 | # learning rate as well as the training epoch number. 122 | agent.attach(bc.VerboseController( 123 | evaluate_on='epoch', 124 | periodicity=1)) 125 | 126 | # At the end of every epoch, one can modify the learning rate using a LearningRateController. Here we 127 | # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 128 | agent.attach(bc.LearningRateController( 129 | initial_learning_rate=parameters.learning_rate, 130 | learning_rate_decay=parameters.learning_rate_decay, 131 | periodicity=1)) 132 | 133 | # Same for the discount factor. 134 | agent.attach(bc.DiscountFactorController( 135 | initial_discount_factor=parameters.discount, 136 | discount_factor_growth=parameters.discount_inc, 137 | discount_factor_max=parameters.discount_max, 138 | periodicity=1)) 139 | 140 | # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 141 | # Plus, we also want to display after each training episode (not after every training step) the average Bellman 142 | # residual and the average of the V values obtained during the last episode, hence the last two arguments. 143 | agent.attach(bc.TrainerController( 144 | evaluate_on='action', 145 | periodicity=parameters.update_frequency, 146 | show_episode_avg_V_value=True, 147 | show_avg_Bellman_residual=True)) 148 | 149 | # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 150 | # has the highest validation score. 151 | # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochController. It is 152 | # important that the validationID is the same as the id argument of the InterleavedTestEpochController. 153 | # The FindBestController will dump on disk the validation scores for each and every network, as well as the 154 | # structure of the neural network having the best validation score. These dumps can then be used to plot the evolution 155 | # of the validation and test scores (see below) or simply recover the resulting neural network for your 156 | # application. 157 | agent.attach(bc.FindBestController( 158 | validationID=simple_maze_env.VALIDATION_MODE, 159 | testID=None, 160 | unique_fname=fname)) 161 | 162 | # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 163 | # "validation epoch" between each training epoch. For each validation epoch, we also want to display the sum of all 164 | # rewards obtained, hence show_score=True. Finally, we want to call the summarizePerformance method of simple_maze_env 165 | # every [parameters.period_btw_summary_perfs] *validation* epochs (here summarize_every=1).
166 | agent.attach(bc.InterleavedTestEpochController( 167 | id=simple_maze_env.VALIDATION_MODE, 168 | epoch_length=parameters.steps_per_test, 169 | periodicity=1, 170 | show_score=True, 171 | summarize_every=1)) 172 | 173 | # --- Run the experiment --- 174 | try: 175 | os.mkdir("params") 176 | except Exception: 177 | pass 178 | dump(vars(parameters), "params/" + fname + ".jldump") 179 | agent.gathering_data=False 180 | agent.run(parameters.epochs, parameters.steps_per_epoch) 181 | 182 | # --- Show results --- 183 | basename = "scores/" + fname 184 | scores = load(basename + "_scores.jldump") 185 | print (scores) 186 | # plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') 187 | # plt.legend() 188 | # plt.xlabel("Number of epochs") 189 | # plt.ylabel("Score") 190 | # plt.savefig(basename + "_scores.pdf") 191 | # plt.show() 192 | -------------------------------------------------------------------------------- /examples/toy_env/Toy_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | The environment simulates the possibility of buying or selling a good. The agent can either have one unit or zero unit of that good. At each transaction with the market, the agent obtains a reward equivalent to the price of the good when selling it and the opposite when buying. In addition, a penalty of 0.5 (negative reward) is added for each transaction. 3 | Two actions are possible for the agent: 4 | - Action 0 corresponds to selling if the agent possesses one unit or idle if the agent possesses zero unit. 5 | - Action 1 corresponds to buying if the agent possesses zero unit or idle if the agent already possesses one unit. 6 | The state of the agent is made up of an history of two punctual observations: 7 | - The price signal 8 | - Either the agent possesses the good or not (1 or 0) 9 | The price signal is build following the same rules for the training and the validation environment. That allows the agent to learn a strategy that exploits this successfully. 10 | 11 | """ 12 | 13 | import numpy as np 14 | from mpl_toolkits.axes_grid1 import host_subplot 15 | import mpl_toolkits.axisartist as AA 16 | import matplotlib.pyplot as plt 17 | 18 | from deer.base_classes import Environment 19 | 20 | class MyEnv(Environment): 21 | 22 | def __init__(self, rng): 23 | """ Initialize environment. 24 | 25 | Parameters 26 | ----------- 27 | rng : the numpy random number generator 28 | """ 29 | # Defining the type of environment 30 | self._last_ponctual_observation = [0, 0] # At each time step, the observation is made up of two elements, each scalar 31 | 32 | self._random_state = rng 33 | 34 | # Building a price signal with some patterns 35 | self._price_signal=[] 36 | for i in range (1000): 37 | price = np.array([0.,0.,0.,-1.,0.,1.,0., 0., 0.]) 38 | price += self._random_state.uniform(0, 3) 39 | self._price_signal.extend(price.tolist()) 40 | 41 | self._price_signal_train = self._price_signal[:len(self._price_signal)//2] 42 | self._price_signal_valid = self._price_signal[len(self._price_signal)//2:] 43 | self._prices = None 44 | self._counter = 1 45 | 46 | def reset(self, mode): 47 | """ Resets the environment for a new episode. 48 | 49 | Parameters 50 | ----------- 51 | mode : int 52 | -1 is for the training phase, others are for validation/test. 53 | 54 | Returns 55 | ------- 56 | list 57 | Initialization of the sequence of observations used for the pseudo-state; dimension must match self.inputDimensions(). 
58 | If only the current observation is used as a (pseudo-)state, then this list is equal to self._last_ponctual_observation. 59 | """ 60 | if mode == -1: 61 | self.prices = self._price_signal_train 62 | else: 63 | self.prices = self._price_signal_valid 64 | 65 | 66 | self._last_ponctual_observation = [self.prices[0], 0] 67 | 68 | self._counter = 1 69 | return [6*[0], 0] 70 | 71 | def act(self, action): 72 | """ Performs one time-step within the environment and updates the current observation self._last_ponctual_observation 73 | 74 | Parameters 75 | ----------- 76 | action : int 77 | Integer in [0, ..., N_A] where N_A is the number of actions given by self.nActions() 78 | 79 | Returns 80 | ------- 81 | reward: float 82 | """ 83 | reward = 0 84 | 85 | if (action == 0 and self._last_ponctual_observation[1] == 1): 86 | reward = self.prices[self._counter-1] - 0.5 87 | if (action == 1 and self._last_ponctual_observation[1] == 0): 88 | reward = -self.prices[self._counter-1] - 0.5 89 | 90 | self._last_ponctual_observation[0] = self.prices[self._counter] 91 | self._last_ponctual_observation[1] = action 92 | 93 | self._counter += 1 94 | 95 | return reward 96 | 97 | def summarizePerformance(self, test_data_set, *args, **kwargs): 98 | """ 99 | This function is called at every PERIOD_BTW_SUMMARY_PERFS. 100 | Parameters 101 | ----------- 102 | test_data_set 103 | """ 104 | 105 | print ("Summary Perf") 106 | 107 | observations = test_data_set.observations() 108 | prices = observations[0][100:200] 109 | invest = observations[1][100:200] 110 | 111 | steps=np.arange(len(prices)) 112 | steps_long=np.arange(len(prices)*10)/10. 113 | 114 | #print steps,invest,prices 115 | host = host_subplot(111, axes_class=AA.Axes) 116 | plt.subplots_adjust(right=0.9, left=0.1) 117 | 118 | par1 = host.twinx() 119 | 120 | host.set_xlabel("Time") 121 | host.set_ylabel("Price") 122 | par1.set_ylabel("Investment") 123 | 124 | p1, = host.plot(steps_long, np.repeat(prices,10), lw=3, c = 'b', alpha=0.8, ls='-', label = 'Price') 125 | p2, = par1.plot(steps, invest, marker='o', lw=3, c = 'g', alpha=0.5, ls='-', label = 'Investment') 126 | 127 | par1.set_ylim(-0.09, 1.09) 128 | 129 | 130 | host.axis["left"].label.set_color(p1.get_color()) 131 | par1.axis["right"].label.set_color(p2.get_color()) 132 | 133 | plt.savefig("plot.png") 134 | plt.close() 135 | 136 | print ("A plot of the policy obtained has been saved under the name plot.png") 137 | 138 | def inputDimensions(self): 139 | return [(6,), (1,)] # We consider an observation made up of an history of 140 | # - the last six for the first scalar element obtained 141 | # - the last one for the second scalar element 142 | 143 | 144 | def nActions(self): 145 | return 2 # The environment allows two different actions to be taken at each time step 146 | 147 | 148 | def inTerminalState(self): 149 | return False 150 | 151 | def observe(self): 152 | return np.array(self._last_ponctual_observation) 153 | 154 | 155 | 156 | 157 | def main(): 158 | # Can be used for debug purposes 159 | rng = np.random.RandomState(123456) 160 | myenv = MyEnv(rng) 161 | 162 | print (myenv.observe()) 163 | 164 | if __name__ == "__main__": 165 | main() 166 | -------------------------------------------------------------------------------- /examples/toy_env/run_toy_env.py: -------------------------------------------------------------------------------- 1 | """Toy environment launcher. See the docs for more details about this environment. 
2 | 3 | """ 4 | 5 | import sys 6 | import logging 7 | import numpy as np 8 | 9 | from deer.default_parser import process_args 10 | from deer.agent import NeuralAgent 11 | from deer.learning_algos.q_net_keras import MyQNetwork 12 | from Toy_env import MyEnv as Toy_env 13 | import deer.experiment.base_controllers as bc 14 | from deer.policies import EpsilonGreedyPolicy 15 | 16 | 17 | class Defaults: 18 | # ---------------------- 19 | # Experiment Parameters 20 | # ---------------------- 21 | STEPS_PER_EPOCH = 1000 22 | EPOCHS = 50 23 | STEPS_PER_TEST = 500 24 | PERIOD_BTW_SUMMARY_PERFS = 1 25 | 26 | # ---------------------- 27 | # Environment Parameters 28 | # ---------------------- 29 | FRAME_SKIP = 1 30 | 31 | # ---------------------- 32 | # DQN Agent parameters: 33 | # ---------------------- 34 | UPDATE_RULE = 'rmsprop' 35 | LEARNING_RATE = 0.005 36 | LEARNING_RATE_DECAY = 1. 37 | DISCOUNT = 0.9 38 | DISCOUNT_INC = 1. 39 | DISCOUNT_MAX = 0.99 40 | RMS_DECAY = 0.9 41 | RMS_EPSILON = 0.0001 42 | MOMENTUM = 0 43 | CLIP_NORM = 1.0 44 | EPSILON_START = 1.0 45 | EPSILON_MIN = .1 46 | EPSILON_DECAY = 10000 47 | UPDATE_FREQUENCY = 1 48 | REPLAY_MEMORY_SIZE = 1000000 49 | BATCH_SIZE = 32 50 | FREEZE_INTERVAL = 1000 51 | DETERMINISTIC = True 52 | 53 | 54 | if __name__ == "__main__": 55 | logging.basicConfig(level=logging.INFO) 56 | 57 | # --- Parse parameters --- 58 | parameters = process_args(sys.argv[1:], Defaults) 59 | if parameters.deterministic: 60 | rng = np.random.RandomState(123456) 61 | else: 62 | rng = np.random.RandomState() 63 | 64 | # --- Instantiate environment --- 65 | env = Toy_env(rng) 66 | 67 | # --- Instantiate qnetwork --- 68 | qnetwork = MyQNetwork( 69 | env, 70 | parameters.rms_decay, 71 | parameters.rms_epsilon, 72 | parameters.momentum, 73 | parameters.clip_norm, 74 | parameters.freeze_interval, 75 | parameters.batch_size, 76 | parameters.update_rule, 77 | rng) 78 | 79 | train_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.1) 80 | test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.) 81 | 82 | # --- Instantiate agent --- 83 | agent = NeuralAgent( 84 | env, 85 | qnetwork, 86 | parameters.replay_memory_size, 87 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 88 | parameters.batch_size, 89 | rng, 90 | train_policy=train_policy, 91 | test_policy=test_policy) 92 | 93 | # --- Bind controllers to the agent --- 94 | # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 95 | # learning rate as well as the training epoch number. 96 | agent.attach(bc.VerboseController( 97 | evaluate_on='epoch', 98 | periodicity=1)) 99 | 100 | # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 101 | # Plus, we also want to display after each training episode (!= than after every training) the average bellman 102 | # residual and the average of the V values obtained during the last episode, hence the two last arguments. 103 | agent.attach(bc.TrainerController( 104 | evaluate_on='action', 105 | periodicity=parameters.update_frequency, 106 | show_episode_avg_V_value=True, 107 | show_avg_Bellman_residual=True)) 108 | 109 | # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 110 | # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 
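# (Presumably the learning rate is multiplied by learning_rate_decay at each such update; with LEARNING_RATE_DECAY = 1. in the Defaults above it therefore stays at its initial value of 0.005 for the whole run.)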
111 | agent.attach(bc.LearningRateController( 112 | initial_learning_rate=parameters.learning_rate, 113 | learning_rate_decay=parameters.learning_rate_decay, 114 | periodicity=1)) 115 | 116 | # Same for the discount factor. 117 | agent.attach(bc.DiscountFactorController( 118 | initial_discount_factor=parameters.discount, 119 | discount_factor_growth=parameters.discount_inc, 120 | discount_factor_max=parameters.discount_max, 121 | periodicity=1)) 122 | 123 | # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy 124 | # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more 125 | # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every 126 | # episode or epoch (or never, hence the resetEvery='none'). 127 | agent.attach(bc.EpsilonController( 128 | initial_e=parameters.epsilon_start, 129 | e_decays=parameters.epsilon_decay, 130 | e_min=parameters.epsilon_min, 131 | evaluate_on='action', 132 | periodicity=1, 133 | reset_every='none')) 134 | 135 | # We also want to interleave a "test epoch" between each training epoch. 136 | # For each test epoch, we want also to display the sum of all rewards obtained, hence the showScore=True. 137 | # Finally, we want to call the summarizePerformance method of Toy_Env every [parameters.period_btw_summary_perfs] 138 | # *test* epochs. 139 | agent.attach(bc.InterleavedTestEpochController( 140 | id=0, 141 | epoch_length=parameters.steps_per_test, 142 | periodicity=1, 143 | show_score=True, 144 | summarize_every=parameters.period_btw_summary_perfs)) 145 | 146 | print ("Starting the run of the agent for "+str(parameters.epochs)+" epochs, with "+str(parameters.steps_per_epoch)+" steps per epoch") 147 | # --- Run the experiment --- 148 | agent.run(parameters.epochs, parameters.steps_per_epoch) 149 | -------------------------------------------------------------------------------- /examples/toy_env/run_toy_env_simple.py: -------------------------------------------------------------------------------- 1 | """Toy environment launcher. See the docs for more details about this environment. 2 | 3 | """ 4 | 5 | import numpy as np 6 | 7 | from deer.agent import NeuralAgent 8 | from deer.learning_algos.q_net_keras import MyQNetwork 9 | from Toy_env import MyEnv as Toy_env 10 | import deer.experiment.base_controllers as bc 11 | 12 | 13 | rng = np.random.RandomState(123456) 14 | 15 | # --- Instantiate environment --- 16 | env = Toy_env(rng) 17 | 18 | # --- Instantiate qnetwork --- 19 | qnetwork = MyQNetwork( 20 | environment=env, 21 | random_state=rng) 22 | 23 | # --- Instantiate agent --- 24 | agent = NeuralAgent( 25 | env, 26 | qnetwork, 27 | random_state=rng) 28 | 29 | # --- Bind controllers to the agent --- 30 | # Before every training epoch, we want to print a summary of the agent's epsilon, discount and 31 | # learning rate as well as the training epoch number. 32 | agent.attach(bc.VerboseController()) 33 | 34 | # During training epochs, we want to train the agent after every action it takes. 35 | # Plus, we also want to display after each training episode (!= than after every training) the average bellman 36 | # residual and the average of the V values obtained during the last episode. 37 | agent.attach(bc.TrainerController()) 38 | 39 | # We also want to interleave a "test epoch" between each training epoch. 
40 | agent.attach(bc.InterleavedTestEpochController(epoch_length=500)) 41 | 42 | # --- Run the experiment --- 43 | agent.run(n_epochs=100, epoch_length=1000) 44 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF 17 | #formats: 18 | # - pdf 19 | 20 | # Optionally set the version of Python and requirements required to build your docs 21 | #python: 22 | # version: 3.7 23 | # install: 24 | # - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /requirements-docs.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | sphinx 3 | numpydoc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.19 2 | joblib>=0.16 3 | matplotlib>=3.3.2 4 | tensorflow>=2.6 5 | keras>=2.6 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | import deer 4 | 5 | NAME = 'deer' 6 | VERSION = '0.4.3' 7 | AUTHOR = "Vincent Francois-Lavet" 8 | AUTHOR_EMAIL = "vincent.francois@gmail.com" 9 | URL = 'https://github.com/VinF/deer' 10 | DESCRIPTION = 'Framework for deep reinforcement learning' 11 | with open('README.rst') as f: 12 | LONG_DESCRIPTION = f.read() 13 | CLASSIFIERS = [ 14 | 'Development Status :: 3 - Alpha', 15 | 'Environment :: Console', 16 | 'Intended Audience :: Developers', 17 | 'Intended Audience :: Science/Research', 18 | 'Intended Audience :: Education', 19 | 'License :: OSI Approved :: BSD License', 20 | 'Operating System :: OS Independent', 21 | 'Programming Language :: Python :: 2.7', 22 | 'Programming Language :: Python :: 3', 23 | 'Programming Language :: Python :: 3.6', 24 | 'Programming Language :: Python :: 3.7', 25 | 'Topic :: Scientific/Engineering', 26 | 'Topic :: Utilities', 27 | 'Topic :: Software Development :: Libraries', 28 | ] 29 | 30 | if __name__ == '__main__': 31 | setup(name=NAME, 32 | version=VERSION, 33 | author=AUTHOR, 34 | author_email=AUTHOR_EMAIL, 35 | url=URL, 36 | description=DESCRIPTION, 37 | long_description=LONG_DESCRIPTION, 38 | license='BSD', 39 | classifiers=CLASSIFIERS, 40 | platforms='any', 41 | packages=find_packages()) --------------------------------------------------------------------------------
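To make the structure of the examples above concrete, here is a minimal sketch of a custom environment plugged into the default Q-network and agent. It only uses calls that already appear in Toy_env.py and run_toy_env_simple.py; the TwoStateEnv class, its reward scheme and the epoch lengths are made up for illustration, so treat it as a starting point rather than a tested script.

import numpy as np

from deer.base_classes import Environment
from deer.agent import NeuralAgent
from deer.learning_algos.q_net_keras import MyQNetwork
import deer.experiment.base_controllers as bc


class TwoStateEnv(Environment):
    """Hypothetical two-state environment: the agent moves to whatever state
    (0 or 1) the action designates, and gets a reward of +1 while in state 1."""

    def __init__(self, rng):
        self._random_state = rng
        self._state = 0

    def reset(self, mode):
        self._state = 0
        return [2 * [0]]          # one observation subject, history of length 2 (see inputDimensions)

    def act(self, action):
        self._state = action      # action is 0 or 1
        return 1. if self._state == 1 else 0.

    def summarizePerformance(self, test_data_set, *args, **kwargs):
        pass                      # nothing to plot for this sketch

    def inputDimensions(self):
        return [(2,)]             # history of the last two scalar observations

    def nActions(self):
        return 2

    def inTerminalState(self):
        return False

    def observe(self):
        return [np.float32(self._state)]


if __name__ == "__main__":
    rng = np.random.RandomState(123456)
    env = TwoStateEnv(rng)

    # Default-constructed Q-network and agent, exactly as in run_toy_env_simple.py
    qnetwork = MyQNetwork(environment=env, random_state=rng)
    agent = NeuralAgent(env, qnetwork, random_state=rng)

    agent.attach(bc.VerboseController())
    agent.attach(bc.TrainerController())
    agent.attach(bc.InterleavedTestEpochController(epoch_length=200))

    agent.run(n_epochs=5, epoch_length=500)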