├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── ci_scripts ├── install.sh ├── success.sh └── test.sh ├── deer ├── __init__.py ├── agent.py ├── base_classes │ ├── __init__.py │ ├── environment.py │ ├── learning_algo.py │ └── policy.py ├── default_parser.py ├── experiment │ ├── __init__.py │ └── base_controllers.py ├── helper │ ├── __init__.py │ └── tree.py ├── learning_algos │ ├── AC_net_keras.py │ ├── CRAR_keras.py │ ├── NN_CRAR_keras.py │ ├── NN_keras.py │ ├── NN_keras_LSTM.py │ ├── __init__.py │ └── q_net_keras.py ├── policies │ ├── EpsilonGreedyPolicy.py │ ├── LongerExplorationPolicy.py │ └── __init__.py └── tests │ ├── __init__.py │ └── test_base.py ├── docs ├── Makefile ├── conf.py ├── index.rst ├── modules │ ├── agents.rst │ ├── controllers.rst │ ├── environments.rst │ ├── learning-algorithms.rst │ └── policies.rst └── user │ ├── development.rst │ ├── environments.rst │ ├── environments │ ├── ALE.rst │ ├── PLE.rst │ ├── gym.rst │ ├── planning.rst │ ├── toy_env_time_series.rst │ └── two_storages.rst │ ├── installation.rst │ └── tutorial.rst ├── examples ├── ALE │ ├── ALE_env.py │ ├── ALE_env_gym.py │ └── run_ALE.py ├── MG_two_storages │ ├── MG_two_storages_env.py │ ├── data │ │ ├── BelgiumPV_prod_test.npy │ │ ├── BelgiumPV_prod_train.npy │ │ ├── example_nondeterminist_cons_test.npy │ │ ├── example_nondeterminist_cons_train.npy │ │ └── spotmarket_data_2007-2013.xls │ ├── plot_MG_operation.py │ └── run_MG_two_storages.py ├── gym │ ├── mountain_car_continuous_env.py │ ├── mountain_car_env.py │ ├── pendulum_env.py │ ├── run_mountain_car.py │ ├── run_mountain_car_continuous.py │ └── run_pendulum.py ├── maze │ ├── a_star_path_finding.py │ ├── maze_env.py │ └── run_maze.py ├── test_CRAR │ ├── catcher_env.py │ ├── run_catcher.py │ ├── run_simple_maze.py │ └── simple_maze_env.py └── toy_env │ ├── Toy_env.py │ ├── run_toy_env.py │ └── run_toy_env_simple.py ├── readthedocs.yml ├── requirements-docs.txt ├── requirements.txt └── setup.py /.coveragerc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/.coveragerc -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyproj 3 | /.vs/General_Deep_Q_RL/v14 4 | /General_Deep_Q_RL.sln 5 | /General_Deep_Q_RL/theano.py 6 | /General_Deep_Q_RL/plot.png 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | language: python 3 | 4 | cache: 5 | apt: true 6 | # We use three different cache directory 7 | # to work around a Travis bug with multi-platform cache 8 | directories: 9 | - $HOME/.cache/pip 10 | - $HOME/download 11 | 12 | env: 13 | global: 14 | # Directory where tests are run from 15 | - TEST_DIR=/tmp/test_dir/ 16 | - MODULE=deer 17 | # - THEANO_VERSION="0.8" 18 | # - NUMPY_VERSION="1.10" 19 | # - SCIPY_VERSION="0.17" 20 | matrix: 21 | - PYTHON_VERSION="2.7" 22 | - PYTHON_VERSION="3.8" 23 | # - PYTHON_VERSION="3.8" EXAMPLE="toy_env" 24 | # - PYTHON_VERSION="3.8" EXAMPLE="mountain_car" 25 | 26 | install: source ci_scripts/install.sh 27 | script: bash ci_scripts/test.sh 28 | #after_success: source ci_scripts/success.sh 29 | 30 | -------------------------------------------------------------------------------- /LICENSE: 
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015, Vincent Francois-Lavet, David Taralla
 2 | All rights reserved.
 3 | 
 4 | Inspired from "Human-level control through deep reinforcement learning",
 5 | Nature, 518(7540):529-533, February 2015 and the implementation of Nathan
 6 | Sprague (https://github.com/spragunr/deep_q_rl)
 7 | 
 8 | This software is released under the 3-Clause BSD license.
 9 | 
10 | Redistribution and use in source and binary forms, with or without
11 | modification, are permitted provided that the following conditions are met:
12 |     * Redistributions of source code must retain the above copyright
13 |       notice, this list of conditions and the following disclaimer.
14 |     * Redistributions in binary form must reproduce the above copyright
15 |       notice, this list of conditions and the following disclaimer in the
16 |       documentation and/or other materials provided with the distribution.
17 |     * Neither the name of the nor the
18 |       names of its contributors may be used to endorse or promote products
19 |       derived from this software without specific prior written permission.
20 | 
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY
25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include *.md LICENSE *.rst
 2 | recursive-include docs *.rst
 3 | recursive-include docs *.py
 4 | prune docs/_build
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | .. -*- mode: rst -*-
 2 | 
 3 | |Python27|_ |Python36|_ |PyPi|_ |License|_
 4 | 
 5 | .. |Python27| image:: https://img.shields.io/badge/python-2.7-blue.svg
 6 | .. _Python27: https://badge.fury.io/py/deer
 7 | 
 8 | .. |Python36| image:: https://img.shields.io/badge/python-3.6-blue.svg
 9 | .. _Python36: https://badge.fury.io/py/deer
10 | 
11 | .. |PyPi| image:: https://badge.fury.io/py/deer.svg
12 | .. _PyPi: https://badge.fury.io/py/deer
13 | 
14 | .. |License| image:: https://img.shields.io/badge/license-BSD--3--Clause-blue.svg
15 | .. _License: https://github.com/VinF/deer/blob/master/LICENSE
16 | 
17 | DeeR
18 | ====
19 | 
20 | DeeR is a Python library for deep reinforcement learning. It is built with modularity in mind so that it can easily be adapted to any need. It provides many possibilities out of the box, such as Double Q-learning, prioritized experience replay, deep deterministic policy gradient (DDPG) and Combined Reinforcement via Abstract Representations (CRAR). Many different environment examples are also provided (some of them using OpenAI Gym).
21 | 
22 | Dependencies
23 | ============
24 | 
25 | This framework is tested to work under Python 3.6.
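It can for instance be installed from PyPI (see the PyPi badge above) with::

    pip install deer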
26 | 27 | The required dependencies are NumPy >= 1.10, joblib >= 0.9. You also need Keras>=2.6. 28 | 29 | For running the examples, Matplotlib >= 1.1.1 is required. 30 | For running the atari games environment, you need to install ALE >= 0.4. 31 | 32 | Full Documentation 33 | ================== 34 | 35 | The documentation is available at : http://deer.readthedocs.io/ 36 | -------------------------------------------------------------------------------- /ci_scripts/install.sh: -------------------------------------------------------------------------------- 1 | # inspired from scikit-learn contrib 2 | 3 | # Deactivate the travis-provided virtual environment and setup a 4 | # conda-based environment instead 5 | deactivate 6 | 7 | # Use the miniconda installer for faster download / install of conda 8 | # itself 9 | pushd . 10 | cd 11 | mkdir -p download 12 | cd download 13 | echo "Cached in $HOME/download :" 14 | ls -l 15 | echo 16 | if [[ ! -f miniconda.sh ]] 17 | then 18 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ #Miniconda3-4.5.4-Linux-x86_64.sh \ 19 | -O miniconda.sh 20 | fi 21 | chmod +x miniconda.sh && ./miniconda.sh -b 22 | cd .. 23 | ls /home/travis 24 | export PATH=/home/travis/miniconda/bin:$PATH 25 | export PATH=/home/travis/miniconda2/bin:$PATH 26 | export PATH=/home/travis/miniconda3/bin:$PATH 27 | conda update --yes conda 28 | popd 29 | 30 | # Configure the conda environment and put it in the path using the 31 | # provided versions 32 | conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ 33 | numpy 34 | 35 | #conda install libgcc -y 36 | source activate testenv 37 | pip install --upgrade pip 38 | pip install scipy 39 | pip install tensorflow-cpu 40 | pip install keras 41 | pip install matplotlib 42 | pip install joblib 43 | #pip install cython 44 | 45 | #if [[ "$PYTHON_VERSION" == "2.7" ]]; then 46 | # pip install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.1-cp27-none-linux_x86_64.whl # tensorflow 47 | #elif [[ "$PYTHON_VERSION" == "3.5" ]]; then 48 | # pip install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.10.1-cp35-cp35m-linux_x86_64.whl 49 | #fi 50 | 51 | if [[ "$COVERAGE" == "true" ]]; then 52 | pip install coverage coveralls 53 | fi 54 | 55 | python --version 56 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 57 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 58 | #python -c "import theano; print('theano %s' % theano.__version__)" 59 | python -c "import tensorflow; print('tensorflow %s' % tensorflow.__version__)" 60 | 61 | python setup.py develop 62 | -------------------------------------------------------------------------------- /ci_scripts/success.sh: -------------------------------------------------------------------------------- 1 | # inspired from scikit-learn contrib 2 | 3 | set -e 4 | 5 | if [[ "$COVERAGE" == "true" ]]; then 6 | # Need to run coveralls from a git checkout, so we copy .coverage 7 | # from TEST_DIR where nosetests has been run 8 | cp $TEST_DIR/.coverage $TRAVIS_BUILD_DIR 9 | cd $TRAVIS_BUILD_DIR 10 | # Ignore coveralls failures as the coveralls server is not 11 | # very reliable but we don't want travis to report a failure 12 | # in the github UI just because the coverage report failed to 13 | # be published. 
14 |     coveralls || echo "Coveralls upload failed"
15 | fi
16 | 
--------------------------------------------------------------------------------
/ci_scripts/test.sh:
--------------------------------------------------------------------------------
 1 | # inspired from scikit-learn contrib
 2 | 
 3 | set -e
 4 | 
 5 | if [[ "$EXAMPLE" == "toy_env" ]]; then
 6 |     cd examples/toy_env
 7 |     python run_toy_env.py --epochs 5
 8 |     python run_toy_env_simple.py & sleep 30; kill $!
 9 | 
10 | elif [[ "$EXAMPLE" == "mountain_car" ]]; then
11 |     pip install gym
12 |     cd examples/gym
13 |     python run_mountain_car.py --epochs 5
14 | 
15 |     # pip -V pip
16 |     # python run_mountain_car_continuous.py --epochs 5
17 | 
18 | else
19 |     # Get into a temp directory to run the tests from the installed package and
20 |     # check that we do not leave artifacts
21 |     mkdir -p $TEST_DIR
22 | 
23 |     cd $TEST_DIR
24 | 
25 |     if [[ "$COVERAGE" == "true" ]]; then
26 |         nosetests -vs --with-coverage --cover-package=$MODULE $MODULE
27 |     else
28 |         nosetests -vs $MODULE
29 |     fi
30 | 
31 | fi
32 | 
--------------------------------------------------------------------------------
/deer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/deer/__init__.py
--------------------------------------------------------------------------------
/deer/base_classes/__init__.py:
--------------------------------------------------------------------------------
 1 | from .environment import Environment
 2 | from .learning_algo import LearningAlgo
 3 | from .policy import Policy
--------------------------------------------------------------------------------
/deer/base_classes/environment.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module defines the base class for the environments.
 3 | 
 4 | """
 5 | 
 6 | import numpy as np
 7 | 
 8 | class Environment(object):
 9 |     """All your Environment classes should inherit this interface.
10 | 
11 |     The environment defines the dynamics and the reward signal that the agent observes when interacting with it.
12 | 
13 |     At every time step, the agent receives from the environment a collection of observable elements. Observing the
14 |     environment at time t thus corresponds to obtaining a punctual observation for each of these elements. Depending on
15 |     the control problem to solve, it may be useful for the agent to base its action not only on the current punctual
16 |     observations but on a history of the last punctual observations. In this framework, it is the environment that
17 |     defines, for each observable element, how many past punctual observations have to be considered.
18 | 
19 |     Different "modes" are used in this framework to allow the environment to have different dynamics and/or reward signal.
20 |     For instance, in training mode, only a part of the dynamics may be available so that it is possible to see how well
21 |     the agent generalizes to a slightly different one.
22 |     """
23 | 
24 |     def reset(self, mode):
25 |         """Resets the environment and puts it in mode [mode]. This function is called at the beginning of every new episode.
26 | 
27 |         The [mode] can for instance be used to discriminate between an agent that is training and an agent that is being
28 |         evaluated for a validation or generalization score. The mode is set only by resetting the environment with this
29 |         method and is preserved until the next call to reset().
30 | 31 | Parameters 32 | ----------- 33 | mode : int 34 | The mode to put the environment into. Mode "-1" is reserved and always means "training". 35 | 36 | Returns 37 | ------- 38 | Initialization of the pseudo state at the beginning of a new episode: list (of lists) with size given by inputDimensions 39 | """ 40 | 41 | raise NotImplementedError() 42 | 43 | def act(self, action): 44 | """Applies the agent action [action] on the environment. 45 | 46 | Parameters 47 | ----------- 48 | action : int 49 | The action selected by the agent to operate on the environment. Should be an identifier 50 | included between 0 included and nActions() excluded. 51 | """ 52 | 53 | raise NotImplementedError() 54 | 55 | def inputDimensions(self): 56 | """Gets the shape of the input space for this environment. 57 | 58 | This returns a list whose length is the number of observations in the environment. Each element of the list is a tuple: 59 | the first integer is always the history size considered for this observation and the rest describes the shape of the 60 | observation at a given time step. For instance: 61 | - () or (1,) means each observation at a given time step is a single scalar and the history size is 1 (= only current 62 | observation) 63 | - (N,) means each observation at a given time step is a single scalar and the history size is N 64 | - (N, M) means each observation at a given time step is a vector of length M and the history size is N 65 | - (N, M1, M2) means each observation at a given time step is a 2D matrix with M1 rows and M2 columns and the history 66 | size is N 67 | """ 68 | 69 | raise NotImplementedError() 70 | 71 | def nActions(self): 72 | """Gets the number of different actions that can be taken on this environment. 73 | It can be either an integer in the case of a finite discrete number of actions 74 | or it can be a list of couples [min_action_value,max_action_value] for a continuous action space""" 75 | 76 | raise NotImplementedError() 77 | 78 | def inTerminalState(self): 79 | """Tells whether the environment reached a terminal state after the last transition (i.e. the last transition 80 | that occured was terminal). 81 | 82 | As the majority of control tasks considered have no end (a continuous control should be operated), by default 83 | this returns always False. But in the context of a video game for instance, terminal states can happen and in 84 | these cases, this method should be overridden. 85 | 86 | Returns 87 | ------- 88 | isTerminal : bool 89 | Whether or not the current state is terminal 90 | """ 91 | 92 | return False 93 | 94 | def observe(self): 95 | """Gets a list of punctual observations composing this environment. 96 | 97 | This returns a list where element i is a punctual observation. Note that the history of observations is not 98 | returned and only the current observation is. 99 | 100 | See the documentation of inputDimensions() for more information about the shape of the observations. 101 | """ 102 | 103 | raise NotImplementedError() 104 | 105 | def summarizePerformance(self, test_data_set, *args, **kwargs): 106 | """Optional hook that can be used to show a summary of the performance of the agent on the 107 | environment in the current mode. 108 | 109 | Parameters 110 | ----------- 111 | test_data_set : agent.DataSet 112 | The dataset maintained by the agent in the current mode, which contains 113 | observations, actions taken and rewards obtained, as well as wether each transition was terminal or 114 | not. 
Refer to the documentation of agent.DataSet for more information.
115 |         """
116 | 
117 |         pass
118 | 
119 |     def observationType(self, subject):
120 |         """Gets the innermost type (np.uint8, np.float32, ...) of [subject].
121 | 
122 |         Parameters
123 |         -----------
124 |         subject : int
125 |             The subject
126 |         """
127 | 
128 |         return np.float32
129 | 
130 |     def end(self):
131 |         """Optional hook called at the end of all epochs
132 |         """
133 | 
134 |         pass
135 | 
--------------------------------------------------------------------------------
/deer/base_classes/learning_algo.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module defines the base class for the learning algorithms.
 3 | 
 4 | """
 5 | 
 6 | 
 7 | class LearningAlgo(object):
 8 |     """ All the Q-networks, actor-critic networks, etc. should inherit this interface.
 9 | 
10 |     Parameters
11 |     -----------
12 |     environment : object from class Environment
13 |         The environment linked to the Q-network
14 |     batch_size : int
15 |         Number of tuples taken into account for each iteration of gradient descent
16 |     """
17 |     def __init__(self, environment, batch_size):
18 |         self._environment = environment
19 |         self._df = 0.9
20 |         self._lr = 0.005
21 |         self._input_dimensions = self._environment.inputDimensions()
22 |         self._n_actions = self._environment.nActions()
23 |         self._batch_size = batch_size
24 | 
25 |     def train(self, states, actions, rewards, nextStates, terminals):
26 |         """ This method performs the training step (e.g. using Bellman iteration in a deep Q-network)
27 |         for one batch of tuples.
28 |         """
29 |         raise NotImplementedError()
30 | 
31 |     def chooseBestAction(self, state):
32 |         """ Get the best action for a pseudo-state
33 |         """
34 |         raise NotImplementedError()
35 | 
36 |     def qValues(self, state):
37 |         """ Get the q values for one pseudo-state
38 |         """
39 |         raise NotImplementedError()
40 | 
41 |     def setLearningRate(self, lr):
42 |         """ Setting the learning rate
43 |         NB: The learning rate usually has to be set in the optimizer; hence, this function should
44 |         be overridden. Otherwise, the learning rate change is likely not to be taken into account.
45 | 
46 |         Parameters
47 |         -----------
48 |         lr : float
49 |             The learning rate that has to be set
50 |         """
51 |         self._lr = lr
52 | 
53 |     def setDiscountFactor(self, df):
54 |         """ Setting the discount factor
55 | 
56 |         Parameters
57 |         -----------
58 |         df : float
59 |             The discount factor that has to be set
60 |         """
61 |         if df < 0. or df > 1.:
62 |             raise ValueError("The discount factor should be in [0,1]")
63 | 
64 |         self._df = df
65 | 
66 |     def learningRate(self):
67 |         """ Getting the learning rate
68 |         """
69 |         return self._lr
70 | 
71 |     def discountFactor(self):
72 |         """ Getting the discount factor
73 |         """
74 |         return self._df
75 | 
76 | if __name__ == "__main__":
77 |     pass
78 | 
--------------------------------------------------------------------------------
/deer/base_classes/policy.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module defines the base class for the policies.
 3 | 
 4 | """
 5 | 
 6 | import numpy as np
 7 | 
 8 | class Policy(object):
 9 |     """Abstract class for all policies.
10 |     A policy takes observations as input, and outputs an action.
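    Concrete policies implement action(); they typically do so by combining the bestAction() and randomAction()
    helpers defined below (see e.g. EpsilonGreedyPolicy).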
11 | 12 | Parameters 13 | ----------- 14 | learning_algo : object from class LearningALgo 15 | n_actions : int or list 16 | Definition of the action space provided by Environment.nActions() 17 | random_state : numpy random number generator 18 | """ 19 | 20 | def __init__(self, learning_algo, n_actions,random_state): 21 | self.learning_algo = learning_algo 22 | self.n_actions = n_actions 23 | self.random_state = random_state 24 | 25 | pass 26 | 27 | def bestAction(self, state, mode=None, *args, **kwargs): 28 | """ Returns the best Action for the given state. This is an additional encapsulation for q-network. 29 | """ 30 | action,V = self.learning_algo.chooseBestAction(state, mode, *args, **kwargs) 31 | return action, V 32 | 33 | def randomAction(self): 34 | """ Returns a random action 35 | """ 36 | if ( isinstance(self.n_actions,int)): 37 | # Discrete set of actions [0,nactions[ 38 | action = self.random_state.randint(0, self.n_actions) 39 | else: 40 | # Continuous set of actions 41 | action=[] 42 | for a in self.n_actions: 43 | action.append( self.random_state.uniform(a[0],a[1]) ) 44 | action=np.array(action) 45 | 46 | V = 0 47 | return action, V 48 | 49 | 50 | def action(self, state): 51 | """Main method of the Policy class. It can be called by agent.py, given a state, 52 | and should return a valid action w.r.t. the environment given to the constructor. 53 | """ 54 | raise NotImplementedError() 55 | -------------------------------------------------------------------------------- /deer/default_parser.py: -------------------------------------------------------------------------------- 1 | """This module contains a function to help parse command-line arguments. 2 | 3 | """ 4 | 5 | 6 | import argparse 7 | 8 | def process_args(args, defaults): 9 | """Handle the command line and return an object containing all the parameters. 10 | 11 | Arguments: 12 | args - list of command line arguments (not including executable name) 13 | defaults - a name space with variables corresponding to each of the required default command line values. 
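    A typical call is expected to look like the following, where Defaults is any object or namespace
    exposing the upper-case attributes referenced below (EPOCHS, STEPS_PER_EPOCH, STEPS_PER_TEST, ...):

        parameters = process_args(sys.argv[1:], Defaults)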
14 | """ 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('-e', '--epochs', dest="epochs", type=int, 18 | default=defaults.EPOCHS, 19 | help='Number of training epochs (default: %(default)s)') 20 | parser.add_argument('-s', '--steps-per-epoch', dest="steps_per_epoch", 21 | type=int, default=defaults.STEPS_PER_EPOCH, 22 | help='Number of steps per epoch (default: %(default)s)') 23 | parser.add_argument('-t', '--test-length', dest="steps_per_test", 24 | type=int, default=defaults.STEPS_PER_TEST, 25 | help='Number of steps per test (default: %(default)s)') 26 | parser.add_argument('-f', '--freq_summary_perfs', dest="period_btw_summary_perfs", 27 | type=int, default=defaults.PERIOD_BTW_SUMMARY_PERFS, 28 | help='freq summary perfs (default: %(default)s)') 29 | parser.add_argument('--frame-skip', dest="frame_skip", 30 | default=defaults.FRAME_SKIP, type=int, 31 | help='Every how many frames to process ' 32 | '(default: %(default)s)') 33 | parser.add_argument('--update-rule', dest="update_rule", 34 | type=str, default=defaults.UPDATE_RULE, 35 | help=('deepmind_rmsprop|rmsprop|sgd ' + 36 | '(default: %(default)s)')) 37 | parser.add_argument('--learning-rate', dest="learning_rate", 38 | type=float, default=defaults.LEARNING_RATE, 39 | help='Learning rate (default: %(default)s)') 40 | parser.add_argument('--learning-rate-decay', dest="learning_rate_decay", 41 | type=float, default=defaults.LEARNING_RATE_DECAY, 42 | help='Learning rate (default: %(default)s)') 43 | parser.add_argument('--rms-decay', dest="rms_decay", 44 | type=float, default=defaults.RMS_DECAY, 45 | help='Decay rate for rms_prop (default: %(default)s)') 46 | parser.add_argument('--rms-epsilon', dest="rms_epsilon", 47 | type=float, default=defaults.RMS_EPSILON, 48 | help='Denominator epsilson for rms_prop ' + 49 | '(default: %(default)s)') 50 | parser.add_argument('--momentum', type=float, default=defaults.MOMENTUM, 51 | help=('Momentum term for Nesterov momentum. '+ 52 | '(default: %(default)s)')) 53 | parser.add_argument('--clip-norm', dest="clip_norm", type=float, 54 | default=defaults.CLIP_NORM, 55 | help=('Max L2 norm for the gradient. ' + 56 | '(default: %(default)s)')) 57 | parser.add_argument('--discount', type=float, default=defaults.DISCOUNT, 58 | help='Discount rate init') 59 | parser.add_argument('--discount_inc', type=float, default=defaults.DISCOUNT_INC, 60 | help='Discount rate') 61 | parser.add_argument('--discount_max', type=float, default=defaults.DISCOUNT_MAX, 62 | help='Discount rate max') 63 | parser.add_argument('--epsilon-start', dest="epsilon_start", 64 | type=float, default=defaults.EPSILON_START, 65 | help=('Starting value for epsilon. ' + 66 | '(default: %(default)s)')) 67 | parser.add_argument('--epsilon-min', dest="epsilon_min", 68 | type=float, default=defaults.EPSILON_MIN, 69 | help='Minimum epsilon. (default: %(default)s)') 70 | parser.add_argument('--epsilon-decay', dest="epsilon_decay", 71 | type=float, default=defaults.EPSILON_DECAY, 72 | help=('Number of steps to minimum epsilon. ' + 73 | '(default: %(default)s)')) 74 | parser.add_argument('--max-history', dest="replay_memory_size", 75 | type=int, default=defaults.REPLAY_MEMORY_SIZE, 76 | help=('Maximum number of steps stored in replay ' + 77 | 'memory. (default: %(default)s)')) 78 | parser.add_argument('--batch-size', dest="batch_size", 79 | type=int, default=defaults.BATCH_SIZE, 80 | help='Batch size. 
(default: %(default)s)') 81 | parser.add_argument('--freeze-interval', dest="freeze_interval", 82 | type=int, default=defaults.FREEZE_INTERVAL, 83 | help=('Interval between target freezes. ' + 84 | '(default: %(default)s)')) 85 | parser.add_argument('--update-frequency', dest="update_frequency", 86 | type=int, default=defaults.UPDATE_FREQUENCY, 87 | help=('Number of actions before each SGD update. '+ 88 | '(default: %(default)s)')) 89 | parser.add_argument('--deterministic', dest='deterministic', action='store_true', 90 | help=('If fixed seed (default: %(default)s)')) 91 | parser.add_argument('--no-deterministic', dest='deterministic', action='store_false', 92 | help=('If no fixed seed')) 93 | parser.set_defaults(deterministic=defaults.DETERMINISTIC) 94 | parser.add_argument('--param1', dest="param1") # Additional parameter depending on the environment 95 | parser.add_argument('--param2', dest="param2") # Additional parameter depending on the environment 96 | parser.add_argument('--param3', dest="param3") # Additional parameter depending on the environment 97 | 98 | parameters = parser.parse_args(args) 99 | 100 | return parameters 101 | 102 | if __name__ == '__main__': 103 | pass 104 | -------------------------------------------------------------------------------- /deer/experiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/deer/experiment/__init__.py -------------------------------------------------------------------------------- /deer/helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/deer/helper/__init__.py -------------------------------------------------------------------------------- /deer/helper/tree.py: -------------------------------------------------------------------------------- 1 | """ Implementation of a binary tree for prioritized experience replay. 2 | Each leaf node is a past experience with its associated priority. 3 | Each parent node is the sum of the priorities of its children. 4 | The tree data structure serves purpose of efficient O(log(n)) priority 5 | update and random batch generation. 6 | 7 | One may check out Schaul et al. (2016) - Prioritized Experience Replay. 8 | 9 | Author: Aaron Zixiao Qiu 10 | """ 11 | 12 | import numpy as np 13 | 14 | class Node: 15 | def __init__(self, position=-1, priority=0, end=-1): 16 | """ The information contained in each node is: 17 | - Children and parent 18 | - Position: indice of the transition in the replay memory, i.e. 19 | the circular buffer used for storing the experiences 20 | - Priority: sum of the priorities of the children. If leaf node, 21 | then it is the priority of the transition. 22 | - End: variable used for tree search based on Position 23 | 24 | """ 25 | self.left = None 26 | self.right = None 27 | self.parent = None 28 | self.position = position 29 | self.priority = priority 30 | self.end = end 31 | 32 | def hasChildren(self): 33 | if (self.right == None and self.left == None): 34 | return False 35 | return True 36 | 37 | class SumTree: 38 | def __init__(self, size): 39 | """ The tree does not implement any insert-related method 40 | because the idea is to initialize the tree to have the same 41 | number of leaves as the size of the replay memory. 
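        A typical usage sketch (sizes and priorities below are only illustrative): build a SumTree with the
        size of the replay memory, call update(index, priority) whenever a transition is stored or has just
        been replayed, and draw prioritized indices with getBatch(n, rng, dataset).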
42 | """ 43 | 44 | self._root = Node() 45 | size_left = int(size/2) 46 | # Initialization of the tree 47 | self._root.left = self._createSubtree(self._root, 0, size_left) # [a,b[ 48 | self._root.right = self._createSubtree(self._root, size_left, size) 49 | self._max_priority = 1 50 | 51 | def _createSubtree(self, parent, begin, end): 52 | """ Build balanced subtrees. 53 | The leaf nodes have their "priority" initialized to 0 and 54 | "position" from 0 to n-1, with n being the size of the replay 55 | memory. 56 | The inner nodes are built while setting their "end" value that 57 | is used to position based search in the tree. 58 | 59 | Arguments: 60 | parent - parent node 61 | begin - lower bound of the range of positions 62 | end - upper bound (excluded) of the range of positions 63 | Return: 64 | node - root of the subtree 65 | """ 66 | n_elem = end - begin 67 | if (n_elem == 1): 68 | node = Node(position=begin) 69 | node.parent = parent 70 | node.end = end 71 | return node 72 | 73 | # At least 2 values (leaves) left 74 | mid = int((end + begin)/2) 75 | node = Node(end=end) 76 | node.parent = parent 77 | node.left = self._createSubtree(node, begin, mid) 78 | node.right = self._createSubtree(node, mid, end) 79 | return node 80 | 81 | def update(self, index, priority=-1): 82 | """ Update a leaf and the tree priorities. 83 | When the replay memory is updated with a new transition, it is 84 | also updated in the tree. The priority of the successive parent 85 | nodes are also modified. 86 | The function is also used to update the priority of an existing 87 | transtion after it has been replayed. 88 | 89 | Arguments: 90 | index - index of the leaf corresponding to the index of the 91 | new transition in the replay memory 92 | priority - the new priority of the leaf 93 | """ 94 | if (priority == -1): 95 | priority = self._max_priority 96 | elif (priority > self._max_priority): 97 | self._max_priority = priority 98 | 99 | # Search for index 100 | node = self.findIndex(index) 101 | 102 | # Replace with new priority 103 | diff = priority - node.priority 104 | node.priority = priority 105 | 106 | # Update value 107 | self._updateValue(node.parent, diff) 108 | 109 | def _updateValue(self, node, diff): 110 | node.priority += diff 111 | if (node.parent != None): 112 | self._updateValue(node.parent, diff) 113 | 114 | def findIndex(self, index): 115 | """ Find a leaf based on the index. 116 | 117 | Arguments: 118 | index - integer between 0 and n-1, n being the size of the 119 | replay memory 120 | Return: 121 | node - leaf with the index 122 | """ 123 | if(self._root != None): 124 | return self._findIndex(index, self._root) 125 | else: 126 | return None 127 | 128 | def _findIndex(self, index, node): 129 | if (node.position == index): 130 | return node 131 | 132 | if (index < node.left.end): 133 | return self._findIndex(index, node.left) 134 | else: 135 | return self._findIndex(index, node.right) 136 | 137 | def getBatch(self, n, rng, dataset): 138 | """ Generate the indices of a random batch of size n. 139 | The samples within the random batch are selected following 140 | the priorities (probabilities) of each transition in the replay 141 | memory. 142 | 143 | Argument: 144 | rng - number of elements in the random batch 145 | Return: 146 | indices - list with indices drawn w.r.t. the transition 147 | priorities. 
148 | """ 149 | pmax = self._root.priority 150 | step = pmax / n 151 | indices = np.zeros(n, dtype='int32') 152 | for i in range(n): 153 | p = rng.uniform(i*step, (i+1)*step) 154 | node = self.find(p) 155 | index = self._checkTerminal(node.position, dataset) 156 | if (index >= 0): 157 | indices[i] = index 158 | else: 159 | return np.zeros(0) 160 | 161 | return indices 162 | 163 | def _checkTerminal(self, index, dataset): 164 | """ Avoid terminal states in the x samples preceding the chosen 165 | index. 166 | 167 | Argument: 168 | index - chosen index based on priority 169 | dataset - contains the circular buffers 170 | Return: 171 | index - checked or corrected value of the input index. 172 | """ 173 | history_size = dataset._max_history_size 174 | terminals = dataset._terminals 175 | n_elems = dataset.n_elems 176 | 177 | lower_bound = history_size - 1 178 | 179 | # Check if the index is valid wrt terminals 180 | first_try = index 181 | start_wrapped = False 182 | while True: 183 | i = index - 1 184 | processed = 0 185 | for _ in range(history_size - 1): 186 | if (i < 0 or terminals[i]): 187 | break; 188 | 189 | i -= 1 190 | processed += 1 191 | 192 | if (processed < history_size - 1): 193 | # if we stopped prematurely, shift slice to the left and try again 194 | index = i 195 | if (index < lower_bound): 196 | start_wrapped = True 197 | index = n_elems - 1 198 | if (start_wrapped and index <= first_try): 199 | return -1 200 | else: 201 | # else index was ok according to terminals 202 | return index 203 | 204 | def find(self, priority): 205 | """ Find a leaf based on the priority. 206 | 207 | Arguments: 208 | priority - the target priority generated randomly 209 | Return: 210 | node - the closest leaf node with a greater priority 211 | """ 212 | if(self._root != None): 213 | return self._find(priority, self._root) 214 | else: 215 | return None 216 | 217 | def _find(self, priority, node): 218 | if (not node.hasChildren()): 219 | return node 220 | 221 | if(priority <= node.left.priority): 222 | return self._find(priority, node.left) 223 | else: 224 | return self._find(priority - node.left.priority, node.right) 225 | 226 | def printTree(self): 227 | # Classical printout method. Mostly for debugging purposes. 
228 | if(self._root != None): 229 | self._printTree(self._root) 230 | 231 | print("===============") 232 | 233 | def _printTree(self, node): 234 | if(node != None): 235 | self._printTree(node.left) 236 | print(node.position, node.priority) 237 | self._printTree(node.right) 238 | 239 | 240 | if __name__ == "__main__": 241 | t = SumTree(10) 242 | t.update(1, 1) 243 | t.update(2, 0.2) 244 | t.update(3, 3.3) 245 | t.update(4, 2.5) 246 | t.update(6, 2) 247 | t.printTree() 248 | 249 | rng = np.random.RandomState() 250 | for _ in range(10): 251 | print(t.getBatch(10, rng)) 252 | 253 | 254 | -------------------------------------------------------------------------------- /deer/learning_algos/AC_net_keras.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for the actor-critic "DDPG" (https://arxiv.org/abs/1509.02971) 3 | 4 | """ 5 | 6 | import numpy as np 7 | from ..base_classes import LearningAlgo as ACNetwork 8 | from .NN_keras import NN # Default Neural network used 9 | from tensorflow.keras.optimizers import SGD,RMSprop 10 | from tensorflow.keras import backend as K 11 | 12 | try: 13 | import tensorflow as tf 14 | assert(K.backend()=="tensorflow") 15 | except: 16 | print('Error : Currently only Tensorflow is supported as a backend for AC_net_keras. You can make the switch in the file ~/.keras/keras.json') 17 | 18 | class MyACNetwork(ACNetwork): 19 | """ 20 | Actor-critic learning (using Keras) with Deep Deterministic Policy Gradient (DDPG) for the continuous action domain 21 | 22 | Parameters 23 | ----------- 24 | environment : object from class Environment 25 | The environment in which the agent evolves. 26 | rho : float 27 | Parameter for rmsprop. Default : 0.9 28 | rms_epsilon : float 29 | Parameter for rmsprop. Default : 0.0001 30 | momentum : float 31 | Momentum for SGD. Default : 0 32 | clip_norm : float 33 | The gradient tensor will be clipped to a maximum L2 norm given by this value. 34 | freeze_interval : int 35 | Period during which the target network is freezed and after which the target network is updated. Default : 1000 36 | batch_size : int 37 | Number of tuples taken into account for each iteration of gradient descent. Default : 32 38 | update_rule: str 39 | {sgd,rmsprop}. Default : rmsprop 40 | random_state : numpy random number generator 41 | Set the random seed. 42 | double_Q : bool, optional 43 | Activate or not the double_Q learning. 44 | More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. 
45 | neural_network_critic : object, optional 46 | default is deer.learning_algos.NN_keras 47 | neural_network_actor : object, optional 48 | default is deer.learning_algos.NN_keras 49 | """ 50 | 51 | def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_norm=0, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network_critic=NN, neural_network_actor=NN): 52 | """ Initialize environment 53 | 54 | """ 55 | ACNetwork.__init__(self,environment, batch_size) 56 | 57 | self._rho = rho 58 | self._rms_epsilon = rms_epsilon 59 | self._momentum = momentum 60 | self._clip_norm = clip_norm 61 | self._freeze_interval = freeze_interval 62 | self._double_Q = double_Q 63 | self._random_state = random_state 64 | self._nActions=environment.nActions() 65 | self.update_counter = 0 66 | 67 | # self.sess = tf.Session() 68 | # K.set_session(self.sess) 69 | 70 | Q_net = neural_network_critic(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, True) 71 | 72 | self.q_vals, self.params, self.inputsQ = Q_net._buildDQN() 73 | 74 | if (update_rule=="sgd"): 75 | optimizer = SGD(lr=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) 76 | elif (update_rule=="rmsprop"): 77 | optimizer = RMSprop(lr=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) 78 | else: 79 | raise Exception('The update_rule '+update_rule+ 'is not implemented.') 80 | 81 | self.q_vals.compile(optimizer=optimizer, loss='mse') 82 | 83 | self.next_q_vals, self.next_params, self.next_inputsQ = Q_net._buildDQN() 84 | self.next_q_vals.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.q_vals 85 | 86 | self._resetQHat() 87 | 88 | 89 | policy_net = neural_network_actor(self._batch_size, self._input_dimensions, self._n_actions, self._random_state, False) 90 | self.policy, self.params_policy = policy_net._buildDQN() 91 | self.policy.compile(optimizer=optimizer, loss='mse') 92 | self.next_policy, self.next_params_policy = policy_net._buildDQN() 93 | self.next_policy.compile(optimizer=optimizer, loss='mse') 94 | 95 | 96 | 97 | ### self.policy 98 | #self.action_grads = tf.gradients(self.q_vals.output,self.inputsQ[-1]) #GRADIENTS for policy update 99 | 100 | 101 | #self.sess.run(tf.initialize_all_variables()) 102 | 103 | 104 | def getAllParams(self): 105 | """ Get all parameters used by the learning algorithm 106 | 107 | Returns 108 | ------- 109 | Values of the parameters: list of numpy arrays 110 | """ 111 | params_value=[] 112 | for i,p in enumerate(self.params): 113 | params_value.append(K.get_value(p)) 114 | for i,p in enumerate(self.params_policy): 115 | params_value.append(K.get_value(p)) 116 | 117 | return params_value 118 | 119 | def setAllParams(self, list_of_values): 120 | """ Set all parameters used by the learning algorithm 121 | 122 | Arguments 123 | --------- 124 | list_of_values : list of numpy arrays 125 | list of the parameters to be set (same order than given by getAllParams()). 126 | """ 127 | for i,p in enumerate(self.params): 128 | K.set_value(p,list_of_values[i]) 129 | for j,p in enumerate(self.params_policy): 130 | K.set_value(p,list_of_values[j+i+1]) 131 | 132 | def train(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): 133 | """ 134 | Train the actor-critic algorithm from one batch of data. 
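        More precisely, the critic is first updated towards a one-step Bellman target computed with the target
        actor and target critic networks; the actor is then updated by moving its output in the direction of the
        critic's gradient with respect to the action (see the gradients() method below).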
135 | 136 | Parameters 137 | ----------- 138 | states_val : numpy array of objects 139 | Each object is a numpy array that relates to one of the observations 140 | with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). 141 | actions_val : numpy array of objects with size [self._batch_size]. 142 | Each object is a numpy array of floats with size [len(self._nActions)] 143 | actions[i] is the action taken after having observed states[:][i]. 144 | rewards_val : numpy array of floats with size [self._batch_size] 145 | rewards[i] is the reward obtained for taking actions[i-1]. 146 | next_states_val : numpy array of objects 147 | Each object is a numpy array that relates to one of the observations 148 | with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]). 149 | terminals_val : numpy array of booleans with size [self._batch_size] 150 | terminals[i] is True if the transition leads to a terminal state and False otherwise 151 | 152 | 153 | Returns 154 | ------- 155 | Average loss of the batch training 156 | Individual losses for each tuple 157 | """ 158 | if self.update_counter % self._freeze_interval == 0: 159 | self._resetQHat() 160 | 161 | 162 | ### Tain self.q_vals 163 | next_actions_val=self.next_policy.predict(next_states_val.tolist()) 164 | 165 | ns_list=next_states_val.tolist() 166 | ns_list.append( next_actions_val ) 167 | next_q_vals = self.next_q_vals.predict( ns_list ) 168 | 169 | not_terminals=np.invert(terminals_val).astype(float) 170 | 171 | target = rewards_val + not_terminals * self._df * next_q_vals.reshape((-1)) 172 | 173 | s_list=states_val.tolist() 174 | s_list.append( np.array(actions_val.tolist()) ) 175 | 176 | # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff 177 | q_vals=self.q_vals.predict( s_list ).reshape((-1)) 178 | diff_q = - q_vals + target 179 | loss_ind_q=pow(diff_q,2) 180 | 181 | loss_q=self.q_vals.train_on_batch( s_list , target ) 182 | 183 | 184 | ### Train self.policy 185 | cur_action=self.policy.predict(states_val.tolist()) 186 | cur_action=self.clip_action(cur_action) 187 | gg=self.gradients(states_val.tolist(),cur_action) 188 | 189 | target_action=self.clip_action(cur_action+gg) 190 | 191 | # Calculation of the individual losses for the policy network 192 | diff_policy = - cur_action + target_action 193 | loss_ind_policy=np.sum(pow(diff_policy,2),axis=-1) 194 | 195 | loss_policy=self.policy.train_on_batch(states_val.tolist(), target_action) 196 | 197 | self.update_counter += 1 198 | 199 | 200 | return loss_q+loss_policy,loss_ind_q+loss_ind_policy 201 | 202 | 203 | def clip_action(self, action): 204 | """ 205 | Clip the possible actions if it is outside the action space defined by self._nActions 206 | self._nActions is given as [[low_action1,high_action1],[low_action2,high_action2], ...] 
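        For instance, with self._nActions=[[-1,1],[0,2]], an action [1.5,-0.3] is clipped to [1.,0.].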
207 | """ 208 | return np.clip(action,np.array(self._nActions)[:,0],np.array(self._nActions)[:,1]) 209 | 210 | 211 | def gradients(self, states, actions): 212 | """ 213 | Returns the gradients on the Q-network for the different actions (used for policy update) 214 | """ 215 | # combine state features with action 216 | input_list = states.copy() 217 | input_list.append(actions) 218 | 219 | # inputs need to be tf.Variable to calculate gradients 220 | input_list = [tf.Variable(input, dtype=tf.float32) for input in input_list] 221 | 222 | with tf.GradientTape() as tape: 223 | q_vals = self.q_vals(input_list) 224 | 225 | grads = tape.gradient(q_vals, input_list) 226 | 227 | #last entry in grads corresponds to the gradients of the q_vals with respect to the action 228 | out = grads[-1].numpy() 229 | 230 | return out 231 | 232 | def chooseBestAction(self, state, *args, **kwargs): 233 | """ Get the best action for a pseudo-state 234 | 235 | Arguments 236 | --------- 237 | state : one pseudo-state 238 | 239 | Returns 240 | ------- 241 | best_action : float 242 | estim_value : float 243 | """ 244 | 245 | best_action=self.policy.predict([np.expand_dims(s,axis=0) for s in state]) 246 | best_action=self.clip_action(best_action) 247 | 248 | the_list=[np.expand_dims(s,axis=0) for s in state] 249 | the_list.append( best_action ) 250 | estim_value=(self.q_vals.predict(the_list)[0,0]) 251 | 252 | return best_action[0],estim_value 253 | 254 | def _resetQHat(self): 255 | for i,(param,next_param) in enumerate(zip(self.params, self.next_params)): 256 | K.set_value(next_param,K.get_value(param)) 257 | -------------------------------------------------------------------------------- /deer/learning_algos/NN_keras.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural network using Keras (called by q_net_keras) 3 | 4 | """ 5 | 6 | import numpy as np 7 | from tensorflow.keras.models import Model 8 | from tensorflow.keras.layers import Input, Layer, Dense, Flatten, concatenate, Activation, Conv2D, MaxPooling2D, Reshape, Permute 9 | 10 | class NN(): 11 | """ 12 | Deep Q-learning network using Keras 13 | 14 | Parameters 15 | ----------- 16 | batch_size : int 17 | Number of tuples taken into account for each iteration of gradient descent 18 | input_dimensions : 19 | n_actions : 20 | random_state : numpy random number generator 21 | Set the random seed. 
22 | action_as_input : Boolean 23 | Whether the action is given as input or as output 24 | """ 25 | def __init__(self, batch_size, input_dimensions, n_actions, random_state, action_as_input=False): 26 | self._input_dimensions=input_dimensions 27 | self._batch_size=batch_size 28 | self._random_state=random_state 29 | self._n_actions=n_actions 30 | self._action_as_input=action_as_input 31 | 32 | def _buildDQN(self): 33 | """ 34 | Build a network consistent with each type of inputs 35 | """ 36 | layers=[] 37 | outs_conv=[] 38 | inputs=[] 39 | 40 | for i, dim in enumerate(self._input_dimensions): 41 | # - observation[i] is a FRAME 42 | if len(dim) == 3 or len(dim) == 4: 43 | if(len(dim) == 4): 44 | input = Input(shape=(dim[-4],dim[-3],dim[-2],dim[-1])) 45 | inputs.append(input) 46 | input = Reshape((dim[-4]*dim[-3],dim[-2],dim[-1]), input_shape=(dim[-4],dim[-3],dim[-2],dim[-1]))(input) 47 | x=Permute((2,3,1), input_shape=(dim[-4]*dim[-3],dim[-2],dim[-1]))(input) #data_format='channels_last' 48 | else: 49 | input = Input(shape=(dim[-3],dim[-2],dim[-1])) 50 | inputs.append(input) 51 | x=Permute((2,3,1), input_shape=(dim[-3],dim[-2],dim[-1]))(input) #data_format='channels_last' 52 | x = Conv2D(8, (4, 4), activation='relu', padding='valid')(x) #Conv on the frames 53 | x = Conv2D(16, (3, 3), activation='relu', padding='valid')(x) #Conv on the frames 54 | x = MaxPooling2D(pool_size=(2, 2), strides=None, padding='valid')(x) 55 | x = Conv2D(16, (3, 3), activation='relu', padding='valid')(x) #Conv on the frames 56 | 57 | out = Flatten()(x) 58 | 59 | # - observation[i] is a VECTOR 60 | elif len(dim) == 2: 61 | if dim[0] > 3: 62 | input = Input(shape=(dim[0],dim[1])) 63 | inputs.append(input) 64 | reshaped=Reshape((dim[0],dim[1],1), input_shape=(dim[0],dim[1]))(input) 65 | x = Conv2D(16, (2, 1), activation='relu', padding='valid')(reshaped)#Conv on the history 66 | x = Conv2D(16, (2, 1), activation='relu', padding='valid')(x) #Conv on the history & features 67 | 68 | out = Flatten()(x) 69 | else: 70 | input = Input(shape=(dim[0],dim[1])) 71 | inputs.append(input) 72 | out = Flatten()(input) 73 | 74 | # - observation[i] is a SCALAR - 75 | else: 76 | if dim[0] > 3: 77 | # this returns a tensor 78 | input = Input(shape=(dim[0],)) 79 | inputs.append(input) 80 | reshaped=Reshape((1,dim[0],1), input_shape=(dim[0],))(input) 81 | x = Conv2D(8, (1,2), activation='relu', padding='valid')(reshaped) #Conv on the history 82 | x = Conv2D(8, (1,2), activation='relu', padding='valid')(x) #Conv on the history 83 | 84 | out = Flatten()(x) 85 | 86 | else: 87 | input = Input(shape=(dim[0],)) 88 | inputs.append(input) 89 | out=input 90 | 91 | outs_conv.append(out) 92 | 93 | if (self._action_as_input==True): 94 | if ( isinstance(self._n_actions,int)): 95 | print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") 96 | else: 97 | input = Input(shape=(len(self._n_actions),)) 98 | inputs.append(input) 99 | outs_conv.append(input) 100 | 101 | if len(outs_conv)>1: 102 | x = concatenate(outs_conv) 103 | else: 104 | x= outs_conv [0] 105 | 106 | # we stack a deep fully-connected network on top 107 | x = Dense(50, activation='relu')(x) 108 | x = Dense(20, activation='relu')(x) 109 | 110 | if (self._action_as_input==False): 111 | if ( isinstance(self._n_actions,int)): 112 | out = Dense(self._n_actions)(x) 113 | else: 114 | out = Dense(len(self._n_actions))(x) 115 | else: 116 | out = Dense(1)(x) 117 | 118 | model = Model(inputs=inputs, outputs=out) 119 | layers=model.layers 120 | 121 | # Grab all the 
parameters together. 122 | params = [ param 123 | for layer in layers 124 | for param in layer.trainable_weights ] 125 | 126 | if (self._action_as_input==True): 127 | return model, params, inputs 128 | else: 129 | return model, params 130 | 131 | if __name__ == '__main__': 132 | pass 133 | 134 | -------------------------------------------------------------------------------- /deer/learning_algos/NN_keras_LSTM.py: -------------------------------------------------------------------------------- 1 | """ 2 | Neural network with LSTM's using Keras (called by q_net_keras) 3 | 4 | """ 5 | 6 | import numpy as np 7 | from tensorflow.keras.models import Model 8 | from tensorflow.keras.layers import Input, Layer, Dense, Flatten, concatenate, Activation, Convolution2D, MaxPooling2D, Reshape 9 | from tensorflow.keras.layers.recurrent import LSTM 10 | 11 | class NN(): 12 | """ 13 | Deep Q-learning network with LSTM's using Keras 14 | 15 | Parameters 16 | ----------- 17 | batch_size : int 18 | Number of tuples taken into account for each iteration of gradient descent 19 | input_dimensions : tuples 20 | n_actions : int 21 | random_state : numpy random number generator 22 | Set the random seed. 23 | action_as_input : Boolean 24 | Whether the action is given as input or as output 25 | """ 26 | def __init__(self, batch_size, input_dimensions, n_actions, random_state, action_as_input=False): 27 | self._input_dimensions=input_dimensions 28 | self._batch_size=batch_size 29 | self._random_state=random_state 30 | self._n_actions=n_actions 31 | self._action_as_input=action_as_input 32 | 33 | def _buildDQN(self): 34 | """ 35 | Build a network consistent with each type of inputs 36 | """ 37 | layers=[] 38 | outs_conv=[] 39 | inputs=[] 40 | 41 | for i, dim in enumerate(self._input_dimensions): 42 | # - observation[i] is a FRAME 43 | if len(dim) == 3: 44 | input = Input(shape=(dim[0],dim[1],dim[2])) 45 | inputs.append(input) 46 | x = Convolution2D(32, 8, 8, border_mode='valid')(input) 47 | x = MaxPooling2D(pool_size=(4, 4), strides=None, border_mode='valid')(x) 48 | x = Convolution2D(64, 4, 4, border_mode='valid')(x) 49 | x = MaxPooling2D(pool_size=(2, 2), strides=None, border_mode='valid')(x) 50 | x = Convolution2D(64, 3, 3)(x) 51 | 52 | # We may add here LSTM's after having flatten the last two dimensions 53 | 54 | x = Flatten()(x) 55 | 56 | # - observation[i] is a VECTOR 57 | if len(dim) == 2: 58 | input = Input(shape=(dim[0],dim[1])) 59 | inputs.append(input) 60 | 61 | if dim[0] > 3: 62 | 63 | x = LSTM(16, 64 | activation='relu', 65 | return_sequences=True)(input) 66 | x = LSTM(16, 67 | activation='relu', 68 | return_sequences=False)(x) # Structure many-to-one 69 | 70 | else: 71 | x=input 72 | x = Flatten()(x) 73 | 74 | # - observation[i] is a SCALAR - 75 | elif(len(dim) == 1): 76 | 77 | input = Input(shape=(dim[0],)) 78 | inputs.append(input) 79 | input = Reshape((dim[0],1))(input) 80 | 81 | if dim[0] > 3: 82 | x = LSTM(8, 83 | activation='relu', 84 | return_sequences=True)(input) 85 | x = LSTM(8, 86 | activation='relu', 87 | return_sequences=False)(x) # Structure many-to-one 88 | else: 89 | x=input 90 | x = Flatten()(x) 91 | 92 | outs_conv.append(x) 93 | 94 | if (self._action_as_input==True): 95 | if ( isinstance(self._n_actions,int)): 96 | print("Error, env.nActions() must be a continuous set when using actions as inputs in the NN") 97 | else: 98 | input = Input(shape=(len(self._n_actions),)) 99 | inputs.append(input) 100 | outs_conv.append(input) 101 | 102 | if len(outs_conv)>1: 103 | x = 
concatenate(outs_conv) 104 | else: 105 | x= outs_conv [0] 106 | 107 | x = Dense(50, activation='relu')(x) 108 | x = Dense(20, activation='relu')(x) 109 | 110 | if (self._action_as_input==False): 111 | if ( isinstance(self._n_actions,int)): 112 | out = Dense(self._n_actions)(x) 113 | else: 114 | out = Dense(len(self._n_actions))(x) 115 | else: 116 | out = Dense(1)(x) 117 | 118 | model = Model(inputs=inputs, outputs=out) 119 | layers=model.layers 120 | 121 | # Grab all the parameters together. 122 | params = [ param 123 | for layer in layers 124 | for param in layer.trainable_weights ] 125 | 126 | if (self._action_as_input==True): 127 | return model, params, inputs 128 | else: 129 | return model, params 130 | 131 | if __name__ == '__main__': 132 | pass 133 | 134 | -------------------------------------------------------------------------------- /deer/learning_algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/deer/learning_algos/__init__.py -------------------------------------------------------------------------------- /deer/learning_algos/q_net_keras.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for general deep Q-learning using Keras that can take as inputs scalars, vectors and matrices 3 | 4 | .. Author: Vincent Francois-Lavet 5 | """ 6 | 7 | import numpy as np 8 | from tensorflow.keras.optimizers import SGD,RMSprop 9 | from tensorflow.keras import backend as K 10 | from ..base_classes import LearningAlgo as QNetwork 11 | from .NN_keras import NN # Default Neural network used 12 | import gc 13 | 14 | class MyQNetwork(QNetwork): 15 | """ 16 | Deep Q-learning network using Keras (with any backend) 17 | 18 | Parameters 19 | ----------- 20 | environment : object from class Environment 21 | The environment in which the agent evolves. 22 | rho : float 23 | Parameter for rmsprop. Default : 0.9 24 | rms_epsilon : float 25 | Parameter for rmsprop. Default : 0.0001 26 | momentum : float 27 | Momentum for SGD. Default : 0 28 | clip_norm : float 29 | The gradient tensor will be clipped to a maximum L2 norm given by this value. 30 | freeze_interval : int 31 | Period during which the target network is freezed and after which the target network is updated. Default : 1000 32 | batch_size : int 33 | Number of tuples taken into account for each iteration of gradient descent. Default : 32 34 | update_rule: str 35 | {sgd,rmsprop}. Default : rmsprop 36 | random_state : numpy random number generator 37 | double_Q : bool, optional 38 | Activate or not the double_Q learning. 39 | More informations in : Hado van Hasselt et al. (2015) - Deep Reinforcement Learning with Double Q-learning. 
40 | neural_network : object, optional 41 | default is deer.learning_algos.NN_keras 42 | """ 43 | 44 | def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_norm=1, freeze_interval=1000, batch_size=32, update_rule="rmsprop", random_state=np.random.RandomState(), double_Q=False, neural_network=NN): 45 | """ Initialize environment 46 | 47 | """ 48 | QNetwork.__init__(self,environment, batch_size) 49 | 50 | 51 | self._rho = rho 52 | self._rms_epsilon = rms_epsilon 53 | self._momentum = momentum 54 | self._clip_norm = clip_norm 55 | self._update_rule = update_rule 56 | self._freeze_interval = freeze_interval 57 | self._double_Q = double_Q 58 | self._random_state = random_state 59 | self.update_counter = 0 60 | 61 | Q_net = neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) 62 | self.q_vals, self.params = Q_net._buildDQN() 63 | 64 | self._compile() 65 | 66 | self.next_q_vals, self.next_params = Q_net._buildDQN() 67 | self.next_q_vals.compile(optimizer='rmsprop', loss='mse') #The parameters do not matter since training is done on self.q_vals 68 | 69 | self._resetQHat() 70 | 71 | def getAllParams(self): 72 | """ Get all parameters used by the learning algorithm 73 | 74 | Returns 75 | ------- 76 | Values of the parameters: list of numpy arrays 77 | """ 78 | params_value=[] 79 | for i,p in enumerate(self.params): 80 | params_value.append(K.get_value(p)) 81 | return params_value 82 | 83 | def setAllParams(self, list_of_values): 84 | """ Set all parameters used by the learning algorithm 85 | 86 | Arguments 87 | --------- 88 | list_of_values : list of numpy arrays 89 | list of the parameters to be set (same order than given by getAllParams()). 90 | """ 91 | for i,p in enumerate(self.params): 92 | K.set_value(p,list_of_values[i]) 93 | 94 | def train(self, states_val, actions_val, rewards_val, next_states_val, terminals_val): 95 | """ 96 | Train the Q-network from one batch of data. 97 | 98 | Parameters 99 | ----------- 100 | states_val : numpy array of objects 101 | Each object is a numpy array that relates to one of the observations 102 | with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]. 103 | actions_val : numpy array of integers with size [self._batch_size] 104 | actions[i] is the action taken after having observed states[:][i]. 105 | rewards_val : numpy array of floats with size [self._batch_size] 106 | rewards[i] is the reward obtained for taking actions[i-1]. 107 | next_states_val : numpy array of objects 108 | Each object is a numpy array that relates to one of the observations 109 | with size [batch_size * history size * size of punctual observation (which is 2D,1D or scalar)]. 
110 | terminals_val : numpy array of booleans with size [self._batch_size] 111 | terminals[i] is True if the transition leads to a terminal state and False otherwise 112 | 113 | Returns 114 | ------- 115 | Average loss of the batch training (RMSE) 116 | Individual (square) losses for each tuple 117 | """ 118 | 119 | if self.update_counter % self._freeze_interval == 0: 120 | self._resetQHat() 121 | 122 | next_q_vals = self.next_q_vals.predict(next_states_val.tolist(), verbose=0) 123 | 124 | if(self._double_Q==True): 125 | next_q_vals_current_qnet=self.q_vals.predict(next_states_val.tolist(), verbose=0) 126 | argmax_next_q_vals=np.argmax(next_q_vals_current_qnet, axis=1) 127 | max_next_q_vals=next_q_vals[np.arange(self._batch_size),argmax_next_q_vals].reshape((-1, 1)) 128 | else: 129 | max_next_q_vals=np.max(next_q_vals, axis=1, keepdims=True) 130 | 131 | not_terminals=np.invert(terminals_val).astype(float) 132 | 133 | target = rewards_val + not_terminals * self._df * max_next_q_vals.reshape((-1)) 134 | 135 | q_vals=self.q_vals.predict(states_val.tolist(), verbose=0) 136 | 137 | # In order to obtain the individual losses, we predict the current Q_vals and calculate the diff 138 | q_val=q_vals[np.arange(self._batch_size), actions_val] 139 | diff = - q_val + target 140 | loss_ind=pow(diff,2) 141 | 142 | q_vals[ np.arange(self._batch_size), actions_val ] = target 143 | 144 | # Is it possible to use something more flexible than this? 145 | # Only some elements of next_q_vals are actual value that I target. 146 | # My loss should only take these into account. 147 | # Workaround here is that many values are already "exact" in this update 148 | loss=self.q_vals.train_on_batch(states_val.tolist() , q_vals ) 149 | 150 | self.update_counter += 1 151 | 152 | gc.collect() #Clearing potential unused memory to avoid any memory leak 153 | 154 | # loss*self._n_actions = np.average(loss_ind) 155 | return np.sqrt(loss),loss_ind 156 | 157 | 158 | def qValues(self, state_val): 159 | """ Get the q values for one belief state 160 | 161 | Arguments 162 | --------- 163 | state_val : one belief state 164 | 165 | Returns 166 | ------- 167 | The q values for the provided belief state 168 | """ 169 | q_vals_pred=self.q_vals.predict([np.expand_dims(state,axis=0) for state in state_val], verbose=0)[0] 170 | 171 | return q_vals_pred 172 | 173 | def chooseBestAction(self, state, *args, **kwargs): 174 | """ Get the best action for a pseudo-state 175 | 176 | Arguments 177 | --------- 178 | state : one pseudo-state 179 | 180 | Returns 181 | ------- 182 | The best action : int 183 | """ 184 | q_vals = self.qValues(state) 185 | 186 | action_to_take=np.argmax(q_vals) 187 | corresponding_q_val=np.max(q_vals) 188 | gc.collect() #Clearing potential unused memory to avoid any memory leak 189 | 190 | return action_to_take,corresponding_q_val 191 | 192 | def _compile(self): 193 | """ Compile self.q_vals 194 | """ 195 | 196 | if (self._update_rule=="sgd"): 197 | optimizer = SGD(learning_rate=self._lr, momentum=self._momentum, nesterov=False, clipnorm=self._clip_norm) 198 | elif (self._update_rule=="rmsprop"): 199 | optimizer = RMSprop(learning_rate=self._lr, rho=self._rho, epsilon=self._rms_epsilon, clipnorm=self._clip_norm) 200 | else: 201 | raise Exception('The update_rule '+self._update_rule+' is not implemented.') 202 | 203 | self.q_vals.compile(optimizer=optimizer, loss='mse') 204 | 205 | 206 | def _resetQHat(self): 207 | """ Set the target Q-network weights equal to the main Q-network weights 208 | """ 209 | 210 | for 
i,(param,next_param) in enumerate(zip(self.params, self.next_params)): 211 | K.set_value(next_param,K.get_value(param)) 212 | 213 | self._compile() # recompile to take into account new optimizer parameters that may have changed since 214 | # self._compile() was called in __init__. FIXME: this call should ideally be done elsewhere 215 | # Not ideal to recompile everytime we change e.g. only the lr 216 | 217 | -------------------------------------------------------------------------------- /deer/policies/EpsilonGreedyPolicy.py: -------------------------------------------------------------------------------- 1 | from ..base_classes import Policy 2 | 3 | 4 | class EpsilonGreedyPolicy(Policy): 5 | """The policy acts greedily with probability :math:`1-\epsilon` and acts randomly otherwise. 6 | It is now used as a default policy for the neural agent. 7 | 8 | Parameters 9 | ----------- 10 | epsilon : float 11 | Proportion of random steps 12 | """ 13 | def __init__(self, learning_algo, n_actions, random_state, epsilon): 14 | Policy.__init__(self, learning_algo, n_actions, random_state) 15 | self._epsilon = epsilon 16 | 17 | def action(self, state, mode=None, *args, **kwargs): 18 | if self.random_state.rand() < self._epsilon: 19 | action, V = self.randomAction() 20 | else: 21 | action, V = self.bestAction(state, mode, *args, **kwargs) 22 | 23 | return action, V 24 | 25 | def setEpsilon(self, e): 26 | """ Set the epsilon used for :math:`\epsilon`-greedy exploration 27 | """ 28 | self._epsilon = e 29 | 30 | def epsilon(self): 31 | """ Get the epsilon for :math:`\epsilon`-greedy exploration 32 | """ 33 | return self._epsilon 34 | -------------------------------------------------------------------------------- /deer/policies/LongerExplorationPolicy.py: -------------------------------------------------------------------------------- 1 | """ Exploration policy for permutation invariant environments 2 | 3 | """ 4 | 5 | from ..base_classes import Policy 6 | import itertools 7 | import random 8 | import copy 9 | import numpy as np 10 | 11 | class LongerExplorationPolicy(Policy): 12 | """Simple alternative to :math:`\epsilon`-greedy that can explore more 13 | efficiently for a broad class of realistic problems. 
14 | 15 | Parameters 16 | ----------- 17 | epsilon : float 18 | Proportion of random steps 19 | length : int 20 | Length of the exploration sequences that will be considered 21 | """ 22 | def __init__(self, learning_algo, n_actions, random_state, epsilon, length=10): 23 | Policy.__init__(self, learning_algo, n_actions, random_state) 24 | self._epsilon = epsilon 25 | self._l = length 26 | self._count_down = -1 27 | self._action_sequence = [] 28 | 29 | def action(self, state, mode=None, *args, **kwargs): 30 | if self._count_down >= 0: 31 | # Take the next exploration action in the sequence 32 | V = 0 33 | action = self._action_sequence[self._count_down] 34 | self._count_down -= 1 35 | else: 36 | if self.random_state.rand() < self._epsilon/((1+(self._l-1)*(1-self._epsilon))): 37 | # Take a random action and build an exploration sequence for the next steps 38 | self._count_down = self._l - 1 39 | self._action_sequence = self.sampleUniformActionSequence() 40 | action = self._action_sequence[self._count_down] 41 | V = 0 42 | self._count_down -= 1 43 | else: 44 | # Simply act greedily with respect to what is currently believed to be the best action 45 | action, V = self.bestAction(state, mode, args, kwargs) 46 | 47 | return np.array(action), V 48 | 49 | def setEpsilon(self, e): 50 | """ Set the epsilon 51 | """ 52 | self._epsilon = e 53 | 54 | def epsilon(self): 55 | """ Get the epsilon 56 | """ 57 | return self._epsilon 58 | 59 | def sampleUniformActionSequence(self): 60 | if ( isinstance(self.n_actions,int)): 61 | """ Sample an action sequence of length self._l, where the unordered sequences have uniform probabilities""" 62 | actions_list = range(self.n_actions) 63 | else: 64 | """For N exploration steps, the goal is to have actions such that their sum spans quite uniformly 65 | the whole range of possibilities. Among those possibilities, random choice/order of actions. 
""" 66 | 67 | possible_actions=[] 68 | # Add for all actions N random element between min and max 69 | N=3 70 | for i,a in enumerate(self.n_actions): 71 | possible_actions.append([]) 72 | for j in range(N): 73 | possible_actions[i].append( self.random_state.uniform(self.n_actions[i][0],self.n_actions[i][1]) ) 74 | actions_list = list(itertools.product(*possible_actions)) 75 | 76 | sequences_with_replacement = list(itertools.combinations_with_replacement(actions_list, self._l)) 77 | index_pick = self.random_state.randint(0, len(sequences_with_replacement)) 78 | sequence = list(sequences_with_replacement[index_pick]) 79 | self.random_state.shuffle(sequence) 80 | 81 | return sequence 82 | -------------------------------------------------------------------------------- /deer/policies/__init__.py: -------------------------------------------------------------------------------- 1 | from .EpsilonGreedyPolicy import EpsilonGreedyPolicy 2 | from .LongerExplorationPolicy import LongerExplorationPolicy -------------------------------------------------------------------------------- /deer/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/deer/tests/__init__.py -------------------------------------------------------------------------------- /deer/tests/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | # Example of tests 4 | class TestStringMethods(unittest.TestCase): 5 | 6 | def test_upper(self): 7 | self.assertEqual('foo'.upper(), 'FOO') 8 | 9 | def test_isupper(self): 10 | self.assertTrue('FOO'.isupper()) 11 | self.assertFalse('Foo'.isupper()) 12 | 13 | def test_split(self): 14 | s = 'hello world' 15 | self.assertEqual(s.split(), ['hello', 'world']) 16 | # check that s.split fails when the separator is not a string 17 | with self.assertRaises(TypeError): 18 | s.split(2) 19 | 20 | if __name__ == '__main__': 21 | unittest.main() -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/deer.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/deer.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/deer" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/deer" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
178 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | try: 3 | from unittest.mock import Mock 4 | except ImportError: 5 | from mock import Mock 6 | 7 | #class Mock(MagicMock): 8 | # @classmethod 9 | # def __getattr__(cls, name): 10 | # return Mock() 11 | # 12 | #MOCK_MODULES = ['numpy', 'scipy', 'matplotlib', 'matplotlib.pyplot', 'scipy.interpolate', 'theano', 'theano.tensor', 'joblib', 'lasagne'] 13 | #sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 14 | 15 | sys.modules['pylearn2'] = Mock() 16 | sys.modules['pylearn2.sandbox'] = Mock() 17 | sys.modules['pylearn2.sandbox.cuda_convnet'] = Mock() 18 | sys.modules['pylearn2.sandbox.cuda_convnet.filter_acts'] = \ 19 | Mock(FilterActs=None) 20 | sys.modules['scipy'] = Mock() 21 | sys.modules['theano'] = Mock() 22 | sys.modules['theano.tensor'] = Mock() 23 | sys.modules['theano.tensor.signal'] = Mock() 24 | sys.modules['theano.tensor.nnet'] = Mock() 25 | sys.modules['joblib'] = Mock() 26 | sys.modules['lasagne'] = Mock() 27 | sys.modules['lasagne.updates'] = Mock() 28 | sys.modules['keras.models'] = Mock() 29 | sys.modules['keras.layers'] = Mock() 30 | sys.modules['keras.optimizers'] = Mock() 31 | sys.modules['keras.backend'] = Mock() 32 | sys.modules['keras'] = Mock() 33 | 34 | 35 | # -*- coding: utf-8 -*- 36 | # 37 | # deer documentation build configuration file, created by 38 | # sphinx-quickstart on Wed Apr 6 16:38:40 2016. 39 | # 40 | # This file is execfile()d with the current directory set to its 41 | # containing dir. 42 | # 43 | # Note that not all possible configuration values are present in this 44 | # autogenerated file. 45 | # 46 | # All configuration values have a default; values that are commented out 47 | # serve to show the default. 48 | 49 | import sys 50 | import os 51 | 52 | # If extensions (or modules to document with autodoc) are in another directory, 53 | # add these directories to sys.path here. If the directory is relative to the 54 | # documentation root, use os.path.abspath to make it absolute, like shown here. 55 | sys.path.insert(0, os.path.abspath('.')) 56 | 57 | # -- General configuration ------------------------------------------------ 58 | 59 | # If your documentation needs a minimal Sphinx version, state it here. 60 | #needs_sphinx = '1.0' 61 | 62 | # Add any Sphinx extension module names here, as strings. They can be 63 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 64 | # ones. 65 | extensions = [ 66 | 'sphinx.ext.autodoc', 67 | 'sphinx.ext.autosummary', 68 | 'sphinx.ext.doctest', 69 | 'sphinx.ext.mathjax', 70 | # 'sphinx.ext.viewcode', # create HTML file of source code and link to it 71 | # 'sphinx.ext.linkcode', # link to github, see linkcode_resolve() below 72 | ## 'numpydoc', # !Generates unwanted tables with autoclass! 73 | # 'sphinx.ext.napoleon', # alternative to numpydoc -- looks a bit worse. 74 | ] 75 | 76 | 77 | # Add any paths that contain templates here, relative to this directory. 78 | templates_path = ['_templates'] 79 | 80 | # The suffix of source filenames. 81 | source_suffix = '.rst' 82 | 83 | # The encoding of source files. 84 | #source_encoding = 'utf-8-sig' 85 | 86 | # The master toctree document. 87 | master_doc = 'index' 88 | 89 | # General information about the project. 
90 | project = u'deer' 91 | copyright = u'2016, deer contributors' 92 | 93 | # The version info for the project you're documenting, acts as replacement for 94 | # |version| and |release|, also used in various other places throughout the 95 | # built documents. 96 | # 97 | # The short X.Y version. 98 | version = '0.4.3' 99 | # The full version, including alpha/beta/rc tags. 100 | release = '0.4.3' 101 | 102 | # The language for content autogenerated by Sphinx. Refer to documentation 103 | # for a list of supported languages. 104 | #language = None 105 | 106 | # There are two options for replacing |today|: either, you set today to some 107 | # non-false value, then it is used: 108 | #today = '' 109 | # Else, today_fmt is used as the format for a strftime call. 110 | #today_fmt = '%B %d, %Y' 111 | 112 | # List of patterns, relative to source directory, that match files and 113 | # directories to ignore when looking for source files. 114 | exclude_patterns = ['_build'] 115 | 116 | # The reST default role (used for this markup: `text`) to use for all 117 | # documents. 118 | #default_role = None 119 | 120 | # If true, '()' will be appended to :func: etc. cross-reference text. 121 | #add_function_parentheses = True 122 | 123 | # If true, the current module name will be prepended to all description 124 | # unit titles (such as .. function::). 125 | #add_module_names = True 126 | 127 | # If true, sectionauthor and moduleauthor directives will be shown in the 128 | # output. They are ignored by default. 129 | #show_authors = False 130 | 131 | # The name of the Pygments (syntax highlighting) style to use. 132 | pygments_style = 'sphinx' 133 | 134 | # A list of ignored prefixes for module index sorting. 135 | #modindex_common_prefix = [] 136 | 137 | # If true, keep warnings as "system message" paragraphs in the built documents. 138 | #keep_warnings = False 139 | 140 | 141 | # -- Options for HTML output ---------------------------------------------- 142 | 143 | # The theme to use for HTML and HTML Help pages. See the documentation for 144 | # a list of builtin themes. 145 | #html_theme = 'default' 146 | ### Read the docs style: 147 | if os.environ.get('READTHEDOCS') != 'True': 148 | try: 149 | import sphinx_rtd_theme 150 | except ImportError: 151 | pass # assume we have sphinx >= 1.3 152 | else: 153 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 154 | html_theme = 'sphinx_rtd_theme' 155 | def setup(app): 156 | app.add_stylesheet("fix_rtd.css") 157 | 158 | 159 | # Theme options are theme-specific and customize the look and feel of a theme 160 | # further. For a list of options available for each theme, see the 161 | # documentation. 162 | #html_theme_options = {} 163 | 164 | # Add any paths that contain custom themes here, relative to this directory. 165 | #html_theme_path = [] 166 | 167 | # The name for this set of Sphinx documents. If None, it defaults to 168 | # " v documentation". 169 | #html_title = None 170 | 171 | # A shorter title for the navigation bar. Default is the same as html_title. 172 | #html_short_title = None 173 | 174 | # The name of an image file (relative to this directory) to place at the top 175 | # of the sidebar. 176 | #html_logo = None 177 | 178 | # The name of an image file (within the static path) to use as favicon of the 179 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 180 | # pixels large. 
181 | #html_favicon = None 182 | 183 | # Add any paths that contain custom static files (such as style sheets) here, 184 | # relative to this directory. They are copied after the builtin static files, 185 | # so a file named "default.css" will overwrite the builtin "default.css". 186 | html_static_path = ['_static'] 187 | 188 | # Add any extra paths that contain custom files (such as robots.txt or 189 | # .htaccess) here, relative to this directory. These files are copied 190 | # directly to the root of the documentation. 191 | #html_extra_path = [] 192 | 193 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 194 | # using the given strftime format. 195 | #html_last_updated_fmt = '%b %d, %Y' 196 | 197 | # If true, SmartyPants will be used to convert quotes and dashes to 198 | # typographically correct entities. 199 | #html_use_smartypants = True 200 | 201 | # Custom sidebar templates, maps document names to template names. 202 | #html_sidebars = {} 203 | 204 | # Additional templates that should be rendered to pages, maps page names to 205 | # template names. 206 | #html_additional_pages = {} 207 | 208 | # If false, no module index is generated. 209 | #html_domain_indices = True 210 | 211 | # If false, no index is generated. 212 | #html_use_index = True 213 | 214 | # If true, the index is split into individual pages for each letter. 215 | #html_split_index = False 216 | 217 | # If true, links to the reST sources are added to the pages. 218 | #html_show_sourcelink = True 219 | 220 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 221 | #html_show_sphinx = True 222 | 223 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 224 | #html_show_copyright = True 225 | 226 | # If true, an OpenSearch description file will be output, and all pages will 227 | # contain a tag referring to it. The value of this option must be the 228 | # base URL from which the finished HTML is served. 229 | #html_use_opensearch = '' 230 | 231 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 232 | #html_file_suffix = None 233 | 234 | # Output file base name for HTML help builder. 235 | htmlhelp_basename = 'deerdoc' 236 | 237 | 238 | # -- Options for LaTeX output --------------------------------------------- 239 | 240 | latex_elements = { 241 | # The paper size ('letterpaper' or 'a4paper'). 242 | #'papersize': 'letterpaper', 243 | 244 | # The font size ('10pt', '11pt' or '12pt'). 245 | #'pointsize': '10pt', 246 | 247 | # Additional stuff for the LaTeX preamble. 248 | #'preamble': '', 249 | } 250 | 251 | # Grouping the document tree into LaTeX files. List of tuples 252 | # (source start file, target name, title, 253 | # author, documentclass [howto, manual, or own class]). 254 | latex_documents = [ 255 | ('index', 'deer.tex', u'deer Documentation', 256 | u'deer contributors', 'manual'), 257 | ] 258 | 259 | # The name of an image file (relative to this directory) to place at the top of 260 | # the title page. 261 | #latex_logo = None 262 | 263 | # For "manual" documents, if this is true, then toplevel headings are parts, 264 | # not chapters. 265 | #latex_use_parts = False 266 | 267 | # If true, show page references after internal links. 268 | #latex_show_pagerefs = False 269 | 270 | # If true, show URL addresses after external links. 271 | #latex_show_urls = False 272 | 273 | # Documents to append as an appendix to all manuals. 274 | #latex_appendices = [] 275 | 276 | # If false, no module index is generated. 
277 | #latex_domain_indices = True 278 | 279 | 280 | # -- Options for manual page output --------------------------------------- 281 | 282 | # One entry per manual page. List of tuples 283 | # (source start file, name, description, authors, manual section). 284 | man_pages = [ 285 | ('index', 'deer', u'deer Documentation', 286 | [u'Vincent François-Lavet'], 1) 287 | ] 288 | 289 | # If true, show URL addresses after external links. 290 | #man_show_urls = False 291 | 292 | 293 | # -- Options for Texinfo output ------------------------------------------- 294 | 295 | # Grouping the document tree into Texinfo files. List of tuples 296 | # (source start file, target name, title, author, 297 | # dir menu entry, description, category) 298 | texinfo_documents = [ 299 | ('index', 'deer', u'deer Documentation', 300 | u'deer contributors', 'deer', 'One line description of project.', 301 | 'Miscellaneous'), 302 | ] 303 | 304 | # Documents to append as an appendix to all manuals. 305 | #texinfo_appendices = [] 306 | 307 | # If false, no module index is generated. 308 | #texinfo_domain_indices = True 309 | 310 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 311 | #texinfo_show_urls = 'footnote' 312 | 313 | # If true, do not generate a @detailmenu in the "Top" node's menu. 314 | #texinfo_no_detailmenu = False 315 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to DeeR's documentation! 2 | ================================== 3 | 4 | DeeR (Deep Reinforcement) is a python library to train an agent how to behave in a given environment so as to maximize a cumulative sum of rewards (see :ref:`what-is-deer`). 5 | 6 | Here are key advantages of the library: 7 | 8 | * You have access within a single library to techniques such as Double Q-learning, prioritized Experience Replay, Deep deterministic policy gradient (DDPG), Combined Reinforcement via Abstract Representations (CRAR), etc. 9 | * This package provides a general framework where observations are made up of any number of elements (scalars, vectors or frames). 10 | * You can easily add up a validation phase that allows to stop the training process before overfitting. This possibility is useful when the environment is dependent on scarce data (e.g. limited time series). 11 | 12 | In addition, the framework is made in such a way that it is easy to 13 | 14 | * build any environment 15 | * modify any part of the learning process 16 | * use your favorite python-based framework to code your own learning algorithm or neural network architecture. The provided learning algorithms and neural network architectures are based on Keras. 17 | 18 | .. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/schema_deer.png 19 | :scale: 50 % 20 | :alt: alternate text 21 | :align: right 22 | 23 | :Figure: General schema of the different elements available in DeeR. 24 | 25 | It is a work in progress and input is welcome. Please submit any contribution via pull request. 26 | 27 | What is new 28 | ------------ 29 | Version 0.4 30 | ************ 31 | - Integration of CRAR that allows to combine the model-free and the model-based approaches via abstract representations. 32 | - Augmented documentation and some interfaces have been updated. 33 | 34 | Version 0.3 35 | ************ 36 | - Integration of different exploration/exploitation policies and possibility to easily built your own. 
37 | - Integration of DDPG for continuous action spaces (see :ref:`actor-critic`) 38 | - :ref:`naming_conv` and some interfaces have been updated. This may cause broken backward compatibility. In that case, make the changes to the new convention by looking at the API in this documentation or by looking at the current version of the examples. 39 | - Additional automated tests 40 | 41 | Version 0.2 42 | *********** 43 | - Standalone python package (you can simply do ``pip install deer``) 44 | - Integration of new examples environments : :ref:`toy_env_pendulum`, :ref:`PLE` and :ref:`gym` 45 | - Double Q-learning and prioritized Experience Replay 46 | - Augmented documentation 47 | - First automated tests 48 | 49 | Future extensions: 50 | ****************** 51 | 52 | * Several agents interacting in the same environment 53 | * ... 54 | 55 | How should I cite DeeR? 56 | ************************ 57 | 58 | Please cite DeeR in your publications if you use it in your research. Here is an example BibTeX entry: 59 | 60 | .. code-block:: bash 61 | 62 | @misc{franccoislavet2016deer, 63 | title={DeeR}, 64 | author={Fran\c{c}ois-Lavet, Vincent and others}, 65 | year={2016}, 66 | howpublished={\url{https://deer.readthedocs.io/}}, 67 | } 68 | 69 | 70 | User Guide 71 | ------------ 72 | 73 | .. toctree:: 74 | :maxdepth: 2 75 | 76 | user/installation 77 | user/tutorial 78 | user/environments 79 | user/development 80 | 81 | API reference 82 | ------------- 83 | 84 | If you are looking for information on a specific function, class or method, this API is for you. 85 | 86 | .. toctree:: 87 | :maxdepth: 2 88 | 89 | modules/agents 90 | modules/controllers 91 | modules/environments 92 | modules/learning-algorithms 93 | modules/policies 94 | 95 | Indices and tables 96 | ------------------ 97 | 98 | * :ref:`genindex` 99 | * :ref:`modindex` 100 | * :ref:`search` 101 | 102 | .. _GitHub: https://github.com/VinF/Deer 103 | -------------------------------------------------------------------------------- /docs/modules/agents.rst: -------------------------------------------------------------------------------- 1 | .. _`agents`: 2 | 3 | :mod:`Agent` 4 | ============= 5 | 6 | .. automodule:: deer.agent 7 | 8 | .. autosummary:: 9 | 10 | NeuralAgent 11 | DataSet 12 | 13 | .. autoclass:: NeuralAgent 14 | :members: 15 | .. autoclass:: DataSet 16 | :members: 17 | -------------------------------------------------------------------------------- /docs/modules/controllers.rst: -------------------------------------------------------------------------------- 1 | .. _`controllers`: 2 | 3 | :mod:`Controller` 4 | =================== 5 | 6 | .. automodule:: deer.experiment.base_controllers 7 | 8 | 9 | .. autosummary:: 10 | 11 | Controller 12 | LearningRateController 13 | EpsilonController 14 | DiscountFactorController 15 | TrainerController 16 | InterleavedTestEpochController 17 | FindBestController 18 | 19 | .. autoclass:: Controller 20 | :members: 21 | .. autoclass:: LearningRateController 22 | :show-inheritance: 23 | .. autoclass:: EpsilonController 24 | :show-inheritance: 25 | .. autoclass:: DiscountFactorController 26 | :show-inheritance: 27 | .. autoclass:: TrainerController 28 | :show-inheritance: 29 | .. autoclass:: InterleavedTestEpochController 30 | :show-inheritance: 31 | .. autoclass:: FindBestController 32 | :show-inheritance: 33 | -------------------------------------------------------------------------------- /docs/modules/environments.rst: -------------------------------------------------------------------------------- 1 | .. 
_`env_interface`: 2 | 3 | :mod:`Environment` 4 | ============================= 5 | 6 | .. automodule:: deer.base_classes.environment 7 | 8 | .. autoclass:: deer.base_classes.Environment 9 | :members: 10 | 11 | -------------------------------------------------------------------------------- /docs/modules/learning-algorithms.rst: -------------------------------------------------------------------------------- 1 | .. _learning-algorithms: 2 | 3 | :mod:`Learning algorithms` 4 | ========================== 5 | 6 | .. autosummary:: 7 | deer.base_classes.LearningAlgo 8 | deer.learning_algos.q_net_keras.MyQNetwork 9 | deer.learning_algos.AC_net_keras.MyACNetwork 10 | deer.learning_algos.CRAR_keras.CRAR 11 | 12 | .. autoclass:: deer.base_classes.LearningAlgo 13 | :members: 14 | 15 | .. autoclass:: deer.learning_algos.q_net_keras.MyQNetwork 16 | :members: 17 | 18 | .. autoclass:: deer.learning_algos.AC_net_keras.MyACNetwork 19 | :members: 20 | 21 | .. autoclass:: deer.learning_algos.CRAR_keras.CRAR 22 | :members: 23 | -------------------------------------------------------------------------------- /docs/modules/policies.rst: -------------------------------------------------------------------------------- 1 | .. _policies: 2 | 3 | :mod:`Policies` 4 | ========================== 5 | 6 | .. autosummary:: 7 | 8 | deer.base_classes.Policy 9 | deer.policies.EpsilonGreedyPolicy 10 | deer.policies.LongerExplorationPolicy 11 | 12 | .. autoclass:: deer.base_classes.Policy 13 | :members: 14 | .. autoclass:: deer.policies.EpsilonGreedyPolicy 15 | :members: 16 | :show-inheritance: 17 | .. autoclass:: deer.policies.LongerExplorationPolicy 18 | :members: 19 | :show-inheritance: 20 | -------------------------------------------------------------------------------- /docs/user/development.rst: -------------------------------------------------------------------------------- 1 | .. _dev: 2 | 3 | Development 4 | =========== 5 | 6 | DeeR is a work in progress and contributions are welcome via pull request. 7 | 8 | For more information, you can check out this link : |how_to_contrib|. 9 | 10 | .. |how_to_contrib| raw:: html 11 | 12 | Contributing to an open source Project on github 13 | 14 | 15 | You should also make sure that you install the repository approriately for development (see :ref:`dev-install`). 16 | 17 | Guidelines for this project 18 | --------------------------- 19 | 20 | Here are a few guidelines for this project. 21 | 22 | * Simplicity: Be easy to use but also easy to understand when one digs into the code. Any additional code should be justified by the usefulness of the feature. 23 | * Modularity: The user should be able to easily use its own code with any part of the deer framework (probably at the exception of the core of agent.py that is coded in a very general way). 24 | 25 | These guidelines come of course in addition to all good practices for open source development. 26 | 27 | .. _naming_conv: 28 | 29 | Naming convention for this project 30 | ---------------------------------- 31 | 32 | * All classes and methods have word boundaries using medial capitalization. Classes are written with UpperCamelCase and methods are written with lowerCamelCase respectively. Example: "two words" is rendered as "TwoWords" for the UpperCamelCase (classes) and "twoWords" for the lowerCamelCase (methods). 33 | * All attributes and variables have words separated by underscores. Example: "two words" is rendered as "two_words" 34 | * If a variable is intended to be 'private', it is prefixed by an underscore. 
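
A minimal sketch illustrating these conventions (the class, method and variable names below are invented purely for the example):

.. code-block:: python

    class ExperimentRunner(object):              # class name: UpperCamelCase
        def __init__(self, learning_rate):
            self.learning_rate = learning_rate   # attribute: words separated by underscores
            self._update_counter = 0             # 'private' attribute: leading underscore

        def runOneEpoch(self, epoch_length):     # method name: lowerCamelCase
            """ Run a single epoch of epoch_length steps (illustration only). """
            total_reward = 0                     # local variable: words separated by underscores
            self._update_counter += 1
            return total_reward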
35 | 36 | -------------------------------------------------------------------------------- /docs/user/environments.rst: -------------------------------------------------------------------------------- 1 | .. _examples: 2 | 3 | Examples 4 | ======== 5 | 6 | You can find these examples at the |package_root|. For each example at least two files are provided: 7 | 8 | * A launcher file (whose name usually starts by ``run_``). 9 | * An environnement file (whose name usually ends by ``_env``). 10 | 11 | 12 | .. |package_root| raw:: html 13 | 14 | root of the package 15 | 16 | 17 | The launcher file performs different actions: 18 | 19 | * It instantiates the environment and the agent along with a learning algorithm (such as a q-network). 20 | * It binds controllers to the agent 21 | * it finally runs the experiment 22 | 23 | You can get started with the following examples: 24 | 25 | .. toctree:: 26 | :maxdepth: 2 27 | 28 | environments/toy_env_time_series.rst 29 | environments/gym.rst 30 | environments/two_storages.rst 31 | environments/planning.rst 32 | environments/ALE.rst 33 | -------------------------------------------------------------------------------- /docs/user/environments/ALE.rst: -------------------------------------------------------------------------------- 1 | .. _ale: 2 | 3 | 4 | :mod:`ALE environment` 5 | ======================= 6 | 7 | This environment is an interface with the |ALE_link| that simulates any ATARI game. 8 | 9 | Related paper: Mnih, Volodymyr, et al. "Human-level control through deep reinforcement learning." Nature 518.7540 (2015): 529-533. (Hyper-parameters tuning is necessary if you want to try to replicate close performances.) 10 | 11 | 12 | .. |ALE_link| raw:: html 13 | 14 | ALE environment 15 | -------------------------------------------------------------------------------- /docs/user/environments/PLE.rst: -------------------------------------------------------------------------------- 1 | .. _ple: 2 | 3 | :mod:`PLE environment` 4 | ======================= 5 | 6 | This environment is an interface with the |PLE_link|. The provided example shows how to successfully learn a good policy on the simple |catcher_link| in a few epochs (~10). You should easily be able to learn successful policies for all the games provided (possibly with some hyper-parameters tuning). 7 | 8 | .. |PLE_link| raw:: html 9 | 10 | PLE environment 11 | 12 | .. |catcher_link| raw:: html 13 | 14 | "catcher" game 15 | -------------------------------------------------------------------------------- /docs/user/environments/gym.rst: -------------------------------------------------------------------------------- 1 | .. _gym: 2 | 3 | :mod:`Gym environment` 4 | ======================= 5 | 6 | Some examples are also provided with the |Gym_link|. 7 | 8 | .. |Gym_link| raw:: html 9 | 10 | Gym environment 11 | 12 | Here is the resulting policy for the mountain car example: 13 | 14 | .. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/gym_mountain_car.gif 15 | :width: 500 px 16 | :align: center 17 | 18 | Here is the resulting policy for the pendulum example: 19 | 20 | .. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/gym_pendulum.gif 21 | :width: 500 px 22 | :align: center 23 | 24 | -------------------------------------------------------------------------------- /docs/user/environments/planning.rst: -------------------------------------------------------------------------------- 1 | .. 
_planning: 2 | 3 | 4 | :mod:`Tasks with planning` 5 | ======================= 6 | 7 | You can find the following environments that demonstrate the possibilities of combining model-based and model-free: |CRAR_tests| and |CRAR_maze|. 8 | 9 | .. |CRAR_tests| raw:: html 10 | 11 | simples examples 12 | 13 | .. |CRAR_maze| raw:: html 14 | 15 | how to solve any maze taken from a distribution 16 | -------------------------------------------------------------------------------- /docs/user/environments/toy_env_time_series.rst: -------------------------------------------------------------------------------- 1 | .. _toy_env_time_series: 2 | 3 | :mod:`Toy environment with time series` 4 | ======================================= 5 | 6 | Description of the environement 7 | ############################### 8 | 9 | This environment simulates the possibility of buying or selling a good. The agent can either have one unit or zero unit of that good. At each transaction with the market, the agent obtains a reward equivalent to the price of the good when selling it and the opposite when buying. In addition, a penalty of 0.5 (negative reward) is added for each transaction. 10 | 11 | The price pattern is made by repeating the following signal plus a random constant between 0 and 3: 12 | 13 | .. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/plot_toy_example_signal.png 14 | :width: 250 px 15 | :alt: Toy example price pattern 16 | :align: center 17 | 18 | You can see how this environement is built by looking into the file ``Toy_env.py`` in |toy_env_folder|. It is important to note that any environment derives from the base class :ref:`env_interface` and you can refer to it in order to understand the required methods and their usage. 19 | 20 | .. |toy_env_folder| raw:: html 21 | 22 | examples/toy_env/ 23 | 24 | .. 25 | The price signal is built following the same rules for the training and the validation environments which allows the agent to learn a strategy that exploits this successfully. 26 | 27 | 28 | .. literalinclude:: ../../../examples/toy_env/Toy_env.py 29 | :language: python 30 | :lines: 21-75 31 | 32 | .. literalinclude:: ../../../examples/toy_env/Toy_env.py 33 | :language: python 34 | :lines: 116-130 35 | 36 | 37 | How to run 38 | ########## 39 | 40 | A minimalist way of running this example can be found in the file ``run_toy_env_simple.py`` in |toy_env_folder|. 41 | 42 | * First, we need to import the agent, the Q-network, the environement and some controllers 43 | 44 | .. literalinclude:: ../../../examples/toy_env/run_toy_env_simple.py 45 | :language: python 46 | :lines: 6-11 47 | :linenos: 48 | 49 | 50 | * Then we instantiate the different elements as follows: 51 | 52 | .. literalinclude:: ../../../examples/toy_env/run_toy_env_simple.py 53 | :language: python 54 | :lines: 13-51 55 | :linenos: 56 | 57 | 58 | Results 59 | ######## 60 | 61 | Navigate to the folder ``examples/toy_env/`` in a terminal window. The example can then be run by using 62 | 63 | .. code-block:: bash 64 | 65 | python run_toy_env_simple.py 66 | 67 | You can also choose the full version of the launcher that specifies the hyperparameters for better performance. 68 | 69 | .. code-block:: bash 70 | 71 | python run_toy_env.py 72 | 73 | Every 10 epochs, a graph is saved in the 'toy_env' folder. You can then visualize the test policy at the end of the training: 74 | 75 | .. 
image:: http://vincent.francois-l.be/img_GeneralDeepQRL/plot_toy_example.png 76 | :width: 250 px 77 | :alt: Toy example policy 78 | :align: center 79 | 80 | 81 | In this graph, you can see that the agent has successfully learned to take advantage of the price pattern to buy when it is low and to sell when it is high. This example is of course easy due to the fact that the patterns are very systematic which allows the agent to successfully learn it. It is important to note that the results shown are made on a validation set that is different from the training and we can see that learning generalizes well. For instance, the action of buying at time step 7 and 16 is the expected result because in average this will allow to make profit since the agent has no information on the future. 82 | 83 | Using Convolutions VS LSTM's 84 | ############################ 85 | 86 | So far, the neural network was build by using a convolutional architecture as follows: 87 | 88 | .. image:: http://vincent.francois-l.be/img_GeneralDeepQRL/Convolutions_architecture.png 89 | :width: 350 px 90 | :alt: Convolutions architecture 91 | :align: center 92 | 93 | The neural nework processes time series thanks to a set of convolutions layers. The output of the convolutions as well as the other inputs are followed by fully connected layers and the ouput layer. 94 | 95 | 96 | When working with deep reinforcement learning, it is also possible to work with LSTM's (see for instance this |introduction-LSTM|) 97 | 98 | .. |introduction-LSTM| raw:: html 99 | 100 | introduction to LSTM's 101 | 102 | If you want to use LSTM's architecture, you can import the following libraries 103 | 104 | .. code-block:: bash 105 | 106 | from deer.learning_algos.NN_keras_LSTM import NN as NN_keras 107 | 108 | and then instanciate the qnetwork by specifying the 'neural_network' as follows: 109 | 110 | .. code-block:: bash 111 | 112 | qnetwork = MyQNetwork( 113 | env, 114 | neural_network=NN_keras) 115 | -------------------------------------------------------------------------------- /docs/user/environments/two_storages.rst: -------------------------------------------------------------------------------- 1 | .. _two_storages: 2 | 3 | :mod:`Two storage devices environment` 4 | ======================================== 5 | 6 | Description of the environement 7 | ############################### 8 | 9 | This example simulates the operation of a realistic micro-grid (such as a smart home for instance) that is not connected to the main utility grid (off-grid) and that is provided with PV panels, batteries and hydrogen storage. The battery has the advantage that it is not limited in instaneous power that it can provide or store. The hydrogen storage has the advantage that is can store very large quantity of energy. 10 | 11 | .. code-block:: bash 12 | 13 | python run_MG_two_storage_devices 14 | 15 | 16 | This example uses the environment defined in MG_two_storage_devices_env.py. The agent can either choose to store in the long term storage or take energy out of it while the short term storage handle at best the lack or surplus of energy by discharging itself or charging itself respectively. Whenever the short term storage is empty and cannot handle the net demand a penalty (negative reward) is obtained equal to the value of loss load set to 2euro/kWh. 
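
As an illustration of that reward rule only (the actual environment is implemented in ``MG_two_storages_env.py``; the function and variable names below are hypothetical):

.. code-block:: python

    COST_LOSS_LOAD = 2.0  # euro per kWh of demand that cannot be supplied

    def loss_load_penalty(net_demand_kwh, short_term_storage_kwh):
        """ Negative reward obtained when the short term storage cannot cover the net demand. """
        unsupplied_kwh = max(0.0, net_demand_kwh - short_term_storage_kwh)
        return -COST_LOSS_LOAD * unsupplied_kwh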
17 | 18 | The state of the agent is made up of an history of two to four punctual observations: 19 | 20 | * Charging state of the short term storage (0 is empty, 1 is full) 21 | * Production and consumption (0 is no production or consumption, 1 is maximal production or consumption) 22 | * (Distance to equinox) 23 | * (Predictions of future production : average of the production for the next 24 hours and 48 hours) 24 | 25 | Two actions are possible for the agent: 26 | 27 | * Action 0 corresponds to discharging the long-term storage 28 | * Action 1 corresponds to charging the long-term storage 29 | 30 | More information can be found in 31 | `Deep Reinforcement Learning Solutions for Energy Microgrids Management`_, Vincent François-Lavet, David Taralla, Damien Ernst, Raphael Fonteneau 32 | 33 | .. _Deep Reinforcement Learning Solutions for Energy Microgrids Management: https://ewrl.files.wordpress.com/2016/11/ewrl13-2016-submission_21.pdf 34 | 35 | Annex to the paper 36 | ################## 37 | 38 | .. 39 | Neural network architecture 40 | *************************** 41 | 42 | We propose a neural network architecture where the inputs are provided by the state vector, and where each separate output represents the Q-value function for one of the discretized actions. The action :math:`a_t` to be made at time :math:`t` is whether to charge or discharge the hydrogen storage device with the assumption that the batteries handle at best the current demand (avoid any value of loss load whenever possible). We consider three discretized actions : (i) discharge at full rate the hydrogen storage, (ii) keep it idle or (iii) charge it at full rate. 43 | 44 | The neural network process time series thanks to a set of convolutions that convolves 16 filters of :math:`2 \times 1` with stride 1 followed by a convolution with 16 filters of :math:`2 \times 2` with stride 1. The output of the convolutions as well as the other inputs are then followed by two fully connected layers with 50 and 20 neurons and the ouput layer. The activation function used is the Rectified Linear Unit (ReLU) except for the output layer where no activation function is used. 45 | 46 | .. figure:: http://vincent.francois-l.be/img_GeneralDeepQRL/Convolutions_architecture.png 47 | :width: 400 px 48 | :align: center 49 | 50 | Sketch of the structure of the neural network architecture (without representing the actual number of neurons in each layer). The neural network processes time series thanks to a set of convolutions layers. The output of the convolutions as well as the other inputs are followed by fully connected layers and the ouput layer. 51 | 52 | 53 | PV production and consumption profiles 54 | ************************************** 55 | Solar irradiance varies throughout the year depending on the seasons, and it also varies throughout the day depending on the weather and the position of the sun in the sky relative to the PV panels. The main distinction between these profiles is the difference between summer and winter PV production. In particular, production varies with a factor 1:5 between winter and summer as can be seen from the measurements of PV panels production for a residential customer located in Belgium in the figures below. 56 | 57 | .. figure:: http://vincent.francois-l.be/img_GeneralDeepQRL/ProductionVSMonths_be.png 58 | :width: 300 px 59 | :align: center 60 | 61 | Total energy produced per month 62 | 63 | .. 
figure:: http://vincent.francois-l.be/img_GeneralDeepQRL/ProductionVSTime_1janv_be.png 64 | :width: 300 px 65 | :align: center 66 | 67 | Typical production in winter 68 | 69 | .. figure:: http://vincent.francois-l.be/img_GeneralDeepQRL/ProductionVSTime_1july_be.png 70 | :width: 300 px 71 | :align: center 72 | 73 | Typical production in summer 74 | 75 | 76 | A simple residential consumption profile is considered with a daily average consumption of 18kWh (see figure below). 77 | 78 | .. figure:: http://vincent.francois-l.be/img_GeneralDeepQRL/ConsumptionVSTime_random.png 79 | :width: 300 px 80 | :align: center 81 | 82 | Representative residential consumption profile 83 | 84 | 85 | 86 | Main microgrid parameters 87 | ************************** 88 | 89 | .. list-table:: Data used for the PV panels 90 | :widths: 30 10 20 91 | 92 | * - cost 93 | - :math:`c^{PV}` 94 | - :math:`1 euro/W_p` 95 | * - Efficiency 96 | - :math:`\eta^{PV}` 97 | - :math:`18 \%` 98 | * - Life time 99 | - :math:`L^{PV}` 100 | - :math:`20 years` 101 | 102 | .. list-table:: Data used for the :math:`LiFePO_4` battery 103 | :widths: 30 10 20 104 | 105 | * - cost 106 | - :math:`c^B` 107 | - :math:`500 euro/kWh` 108 | * - discharge efficiency 109 | - :math:`\eta_0^B` 110 | - :math:`90\%` 111 | * - charge efficiency 112 | - :math:`\zeta_0^B` 113 | - :math:`90\%` 114 | * - Maximum instantaneous power 115 | - :math:`P^B` 116 | - :math:`> 10kW` 117 | * - Life time 118 | - :math:`L^{B}` 119 | - :math:`20 years` 120 | 121 | .. list-table:: Data used for the Hydrogen storage device 122 | :widths: 30 10 20 123 | 124 | * - cost 125 | - :math:`c^{H_2}` 126 | - :math:`14 euro/W_p` 127 | * - discharge efficiency 128 | - :math:`\eta_0^{H_2}` 129 | - :math:`65\%` 130 | * - charge efficiency 131 | - :math:`\zeta_0^{H_2}` 132 | - :math:`65\%` 133 | * - Life time 134 | - :math:`L^{H_2}` 135 | - :math:`20 years` 136 | 137 | .. list-table:: Data used for reward function 138 | :widths: 30 10 20 139 | 140 | * - cost endured per kWh not supplied within the microgrid 141 | - :math:`k` 142 | - :math:`2 euro/kWh` 143 | * - revenue/cost per kWh of hydrogen produced/used 144 | - :math:`k^{H_2}` 145 | - :math:`0.1 euro/kWh` 146 | -------------------------------------------------------------------------------- /docs/user/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============== 5 | 6 | 7 | Dependencies 8 | -------------- 9 | 10 | This framework is tested to work under Python 3.6. 11 | 12 | The required dependencies are NumPy >= 1.10, joblib >= 0.9. You also need keras or you can write your own learning algorithms using your favorite deep learning framework. 13 | 14 | For running some of the examples, Matplotlib >= 1.1.1 is required. You also sometimes need to install specific dependencies (e.g. for the atari games, you need to install ALE >= 0.4). 15 | 16 | 17 | We recommend to use the bleeding-edge version and to install it by following the :ref:`dev-install`. If you want a simpler installation procedure and do not intend to modify yourself the learning algorithms etc., you can look at the :ref:`user-install`. 18 | 19 | .. _dev-install: 20 | 21 | Developer install instructions 22 | ------------------------------- 23 | 24 | As a developer, you can set you up with the bleeding-edge version of DeeR with: 25 | 26 | .. 
code-block:: bash 27 | 28 | git clone -b master https://github.com/VinF/deer.git 29 | 30 | Assuming you already have a python environment with ``pip``, you can automatically install all the dependencies (except specific dependencies that you may need for some examples) with: 31 | 32 | .. code-block:: bash 33 | 34 | pip install -r requirements.txt 35 | 36 | 37 | And you can install the framework as a package using the mode ``develop`` so that you can make modifications and test without having to re-install the package. 38 | 39 | .. code-block:: bash 40 | 41 | python setup.py develop 42 | 43 | 44 | .. _user-install: 45 | 46 | User install instructions 47 | -------------------------- 48 | 49 | You can install the framework with pip: 50 | 51 | .. code-block:: bash 52 | 53 | pip install deer 54 | 55 | For the bleeding edge version (recommended), you can simply use 56 | 57 | .. code-block:: bash 58 | 59 | pip install git+git://github.com/VINF/deer.git@master 60 | 61 | 62 | .. 63 | If you want to update it to the bleeding edge version you can use pip for this with the command line below: 64 | 65 | .. code-block:: bash 66 | 67 | pip install --upgrade --no-deps git+git://github.com/VinF/deer 68 | 69 | 70 | -------------------------------------------------------------------------------- /docs/user/tutorial.rst: -------------------------------------------------------------------------------- 1 | Tutorial 2 | ========= 3 | 4 | .. _what-is-deer: 5 | 6 | What is deep reinforcement learning? 7 | ------------------------------------ 8 | 9 | Deep reinforcement learning is the combination of two fields: 10 | 11 | * *Reinforcement learning (RL)* is a theory that allows an agent to learn a startegy so as to maximize a sum of cumulated (delayed) rewards from any given environment. If you are not familiar with RL, you can get up to speed easily with the |SB_link|. 12 | 13 | .. |SB_link| raw:: html 14 | 15 | book by Sutton and Barto 16 | 17 | 18 | * *Deep learning* is a branch of machine learning for regression and classification. It is particularly well suited to model high-level abstractions in data by using multiple processing layers composed of multiple non-linear transformations. 19 | 20 | This combination allows to learn complex tasks such as playing ATARI games from high-dimensional sensory inputs. For more information, you can refer to this |intro-deep-RL|. 21 | 22 | .. |intro-deep-RL| raw:: html 23 | 24 | introduction to deep reinforcement learning 25 | 26 | .. 27 | How does it work? 28 | ------------------- 29 | 30 | In RL, there are two main parts: 31 | 32 | * An agent with learning capabilities. 33 | * An environment. 34 | 35 | The environment defines the task to be performed by the agent with the following elements: 36 | 37 | * a set of environment states S 38 | * a set of actions A 39 | * a dynamics of the system, i.e. rules of transitioning between states 40 | * a reward function, i.e rules that determine the immediate reward (scalar) of a transition 41 | * a set of obsevrations O, that may be the same than S (MDP case) or different (POMDP case) 42 | 43 | 44 | How can I get started? 45 | ----------------------- 46 | 47 | First, make sure you have installed the package properly by following the steps described in :ref:`installation`. 48 | 49 | The general idea of this framework is that you need to instantiate an agent (along with a learning algorithm) and an environment. 
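
A minimal sketch of that instantiation step, loosely based on ``run_toy_env_simple.py`` (it assumes python is launched from ``examples/toy_env/`` so that ``Toy_env`` is importable; see that file for the exact launcher arguments):

.. code-block:: python

    import numpy as np

    from deer.agent import NeuralAgent
    from deer.learning_algos.q_net_keras import MyQNetwork
    from Toy_env import MyEnv as Toy_env

    rng = np.random.RandomState(123456)

    env = Toy_env(rng)                                        # the environment
    qnetwork = MyQNetwork(environment=env, random_state=rng)  # the learning algorithm
    agent = NeuralAgent(env, qnetwork, random_state=rng)      # the agent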
In order to perform an experiment, you also need to attach to the agent some controllers that manage the training and the various parameters of your agent. 50 | 51 | The environment has to be built specifically for each task, while learning algorithms (such as Q-networks) and many controllers are provided within this package. 52 | 53 | The best way to get started is to have a look at the :ref:`examples`, in particular the first two environments, which are simple to understand: 54 | 55 | * :ref:`toy_env_time_series` 56 | * :ref:`gym` 57 | 58 | If you find something that is not yet implemented and you wish to contribute, you can check the section :ref:`dev`. 59 | 60 | .. 61 | From there, you can look at this documentation for more information on the controllers and the other environments. 62 | 63 | Any Question? 64 | ------------- 65 | 66 | .. |Google_group| raw:: html 67 | 68 | https://groups.google.com/forum/#!forum/deer-library 69 | 70 | .. |Deer_issues| raw:: html 71 | 72 | https://github.com/VinF/deer/issues 73 | 74 | You can raise questions about the DeeR project on GitHub: |Deer_issues| 75 | 76 | -------------------------------------------------------------------------------- /examples/ALE/ALE_env.py: -------------------------------------------------------------------------------- 1 | """ Interface with the ALE environment 2 | 3 | """ 4 | 5 | import numpy as np 6 | import cv2 7 | from ale_python_interface import ALEInterface 8 | from deer.base_classes import Environment 9 | 10 | from mpl_toolkits.axes_grid1 import host_subplot 11 | import mpl_toolkits.axisartist as AA 12 | import matplotlib.pyplot as plt 13 | 14 | class MyEnv(Environment): 15 | VALIDATION_MODE = 0 16 | 17 | def __init__(self, rng, rom="ale/breakout.bin", frame_skip=4, 18 | ale_options=[{"key": "random_seed", "value": 0}, 19 | {"key": "color_averaging", "value": True}, 20 | {"key": "repeat_action_probability", "value": 0.}]): 21 | self._mode = -1 22 | self._mode_score = 0.0 23 | self._mode_episode_count = 0 24 | 25 | self._frame_skip = frame_skip if frame_skip >= 1 else 1 26 | self._random_state = rng 27 | 28 | self._ale = ALEInterface() 29 | for option in ale_options: 30 | t = type(option["value"]) 31 | if t is int: 32 | self._ale.setInt(option["key"], option["value"]) 33 | elif t is float: 34 | self._ale.setFloat(option["key"], option["value"]) 35 | elif t is bool: 36 | self._ale.setBool(option["key"], option["value"]) 37 | else: 38 | raise ValueError("Option {} ({}) is not an int, bool or float.".format(option["key"], t)) 39 | self._ale.loadROM(rom) 40 | 41 | w, h = self._ale.getScreenDims() 42 | self._screen = np.empty((h, w), dtype=np.uint8) 43 | self._reduced_screen = np.empty((84, 84), dtype=np.uint8) 44 | self._actions = self._ale.getMinimalActionSet() 45 | 46 | 47 | def reset(self, mode): 48 | if mode == MyEnv.VALIDATION_MODE: 49 | if self._mode != MyEnv.VALIDATION_MODE: 50 | self._mode = MyEnv.VALIDATION_MODE 51 | self._mode_score = 0.0 52 | self._mode_episode_count = 0 53 | else: 54 | self._mode_episode_count += 1 55 | elif self._mode != -1: # and thus mode == -1 56 | self._mode = -1 57 | 58 | self._ale.reset_game() 59 | for _ in range(self._random_state.randint(15)): 60 | self._ale.act(0) 61 | self._ale.getScreenGrayscale(self._screen) 62 | cv2.resize(self._screen, (84, 84), self._reduced_screen, interpolation=cv2.INTER_NEAREST) 63 | 64 | return [4 * [84 * [84 * [0]]]] 65 | 66 | 67 | def act(self, action): 68 | action = self._actions[action] 69 | 70 | reward = self._ale.act(action) 71 | #if
self.inTerminalState(): 72 | # break 73 | 74 | self._ale.getScreenGrayscale(self._screen) 75 | cv2.resize(self._screen, (84, 84), self._reduced_screen, interpolation=cv2.INTER_NEAREST) 76 | 77 | self._mode_score += reward 78 | return np.sign(reward) 79 | 80 | def summarizePerformance(self, test_data_set, *args, **kwargs): 81 | if self.inTerminalState() == False: 82 | self._mode_episode_count += 1 83 | print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) 84 | 85 | 86 | def inputDimensions(self): 87 | return [(4, 84, 84)] 88 | 89 | def observationType(self, subject): 90 | return np.uint8 91 | 92 | def nActions(self): 93 | return len(self._actions) 94 | 95 | def observe(self): 96 | return [np.array(self._reduced_screen)] 97 | 98 | def inTerminalState(self): 99 | return self._ale.game_over() 100 | 101 | 102 | 103 | if __name__ == "__main__": 104 | pass -------------------------------------------------------------------------------- /examples/ALE/ALE_env_gym.py: -------------------------------------------------------------------------------- 1 | """ Interface with the ALE environment 2 | 3 | Authors: Vincent Francois-Lavet 4 | """ 5 | import numpy as np 6 | np.set_printoptions(threshold=np.nan) 7 | import cv2 8 | #from ale_python_interface import ALEInterface 9 | import gym 10 | from deer.base_classes import Environment 11 | 12 | #import matplotlib 13 | #matplotlib.use('qt5agg') 14 | #from mpl_toolkits.axes_grid1 import host_subplot 15 | #import mpl_toolkits.axisartist as AA 16 | #import matplotlib.pyplot as plt 17 | #from PIL import Image 18 | 19 | class MyEnv(Environment): 20 | VALIDATION_MODE = 0 21 | 22 | def __init__(self, rng, **kwargs): 23 | """ Initialize environment. 24 | 25 | Arguments: 26 | rng - the numpy random number generator 27 | """ 28 | if(bool(kwargs["game"])): 29 | self.env = gym.make(kwargs["game"]) 30 | else: 31 | # Choice between Seaquest-v4, Breakout-v4, SpaceInvaders-v4, BeamRider-v4, Qbert-v4, Freeway-v4', etc. 32 | self.env = gym.make('Seaquest-v4') 33 | self._random_state=rng 34 | self.env.reset() 35 | frame_skip=kwargs.get('frame_skip',1) 36 | self._frame_skip = frame_skip if frame_skip >= 1 else 1 37 | 38 | self._screen=np.average(self.env.render(mode='rgb_array'),axis=-1) 39 | self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_LINEAR) 40 | #decide whether you want to keep this in repo, if so: add dependency to cv2 41 | #plt.imshow(self._reduced_screen, cmap='gray') 42 | #plt.show() 43 | 44 | self._mode = -1 45 | self._mode_score = 0.0 46 | self._mode_episode_count = 0 47 | 48 | 49 | 50 | def reset(self, mode): 51 | if mode == self._mode: 52 | # already in the right mode 53 | self._mode_episode_count += 1 54 | else: 55 | # switching mode 56 | self._mode = mode 57 | self._mode_score = 0.0 58 | self._mode_episode_count = 0 59 | 60 | self.env.reset() 61 | for _ in range(self._random_state.randint(15)): 62 | action = self.env.action_space.sample() 63 | 64 | # this executes the environment with an action, 65 | # and returns the observation of the environment, 66 | # the reward, if the env is over, and other info. 
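            # NB: this relies on the classic gym "step" API that returns a 4-tuple
            # (observation, reward, done, info); gym >= 0.26 and gymnasium return a
            # 5-tuple instead, so this example assumes an older gym release.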
67 | observation, reward, self.terminal, info = self.env.step(action) 68 | 69 | self._screen=np.average(self.env.render(mode='rgb_array'),axis=-1) 70 | self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_LINEAR) 71 | self.state=np.zeros((84,84), dtype=np.uint8) #FIXME 72 | 73 | return [1*[4 * [84 * [84 * [0]]]]] 74 | 75 | 76 | def act(self, action): 77 | #print "action" 78 | #print action 79 | 80 | self.state=np.zeros((4,84,84), dtype=np.float) 81 | reward=0 82 | for t in range(4): 83 | observation, r, self.terminal, info = self.env.step(action) 84 | #print "observation, reward, self.terminal" 85 | #print observation, reward, self.terminal 86 | reward+=r 87 | if self.inTerminalState(): 88 | break 89 | 90 | self._screen=np.average(observation,axis=-1) # Gray levels 91 | self._reduced_screen = cv2.resize(self._screen, (84, 84), interpolation=cv2.INTER_NEAREST) # 84*84 92 | #plt.imshow(self._screen, cmap='gray') 93 | #plt.show() 94 | self.state[t,:,:]=self._reduced_screen 95 | 96 | self._mode_score += reward 97 | return np.sign(reward) 98 | 99 | def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): 100 | if self.inTerminalState() == False: 101 | self._mode_episode_count += 1 102 | print("== Mean score per episode is {} over {} episodes ==".format(self._mode_score / self._mode_episode_count, self._mode_episode_count)) 103 | 104 | 105 | def inputDimensions(self): 106 | return [(1, 4, 84, 84)] 107 | 108 | def observationType(self, subject): 109 | return np.float16 110 | 111 | def nActions(self): 112 | print ("self.env.action_space") 113 | print (self.env.action_space) 114 | return self.env.action_space.n 115 | 116 | def observe(self): 117 | return [(np.array(self.state)-128.)/128.] 118 | 119 | def inTerminalState(self): 120 | return self.terminal 121 | 122 | 123 | 124 | if __name__ == "__main__": 125 | pass -------------------------------------------------------------------------------- /examples/ALE/run_ALE.py: -------------------------------------------------------------------------------- 1 | """ALE launcher. See Wiki for more details about this experiment. 
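The script instantiates a gym-based ALE environment (ALE_env_gym.MyEnv), a double Q-network and a
NeuralAgent, then attaches the usual controllers (trainer, learning rate, discount, epsilon,
interleaved test epochs) before running the experiment.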
2 | 3 | """ 4 | 5 | import sys 6 | import logging 7 | import numpy as np 8 | from joblib import hash, dump,load 9 | import os 10 | 11 | from deer.default_parser import process_args 12 | from deer.agent import NeuralAgent 13 | from deer.learning_algos.q_net_keras import MyQNetwork 14 | from ALE_env_gym import MyEnv as ALE_env 15 | import deer.experiment.base_controllers as bc 16 | 17 | from deer.policies import EpsilonGreedyPolicy 18 | 19 | class Defaults: 20 | # ---------------------- 21 | # Experiment Parameters 22 | # ---------------------- 23 | STEPS_PER_EPOCH = 10000#250000 24 | EPOCHS = 500#40 25 | STEPS_PER_TEST = 2000#125000 26 | PERIOD_BTW_SUMMARY_PERFS = 1 27 | 28 | # ---------------------- 29 | # Environment Parameters 30 | # ---------------------- 31 | FRAME_SKIP = 4 32 | 33 | # ---------------------- 34 | # DQN Agent parameters: 35 | # ---------------------- 36 | UPDATE_RULE = 'rmsprop' 37 | LEARNING_RATE = 0.0002 38 | LEARNING_RATE_DECAY = 1.#0.99 39 | DISCOUNT = 0.95 40 | DISCOUNT_INC = 0.99 41 | DISCOUNT_MAX = 0.99 42 | RMS_DECAY = 0.9 43 | RMS_EPSILON = 0.0001 44 | MOMENTUM = 0 45 | CLIP_NORM = 1.0 46 | EPSILON_START = 1.0 47 | EPSILON_MIN = .1 48 | EPSILON_DECAY = 100000 49 | UPDATE_FREQUENCY = 1 50 | REPLAY_MEMORY_SIZE = 250000 #Each element is 4 frames --> 10^6 frames 51 | BATCH_SIZE = 32 52 | FREEZE_INTERVAL = 2500 53 | DETERMINISTIC = True 54 | 55 | 56 | if __name__ == "__main__": 57 | logging.basicConfig(level=logging.INFO) 58 | 59 | # --- Parse parameters --- 60 | parameters = process_args(sys.argv[1:], Defaults) 61 | if parameters.deterministic: 62 | rng = np.random.RandomState(123456) 63 | else: 64 | rng = np.random.RandomState() 65 | 66 | # --- Instantiate environment --- 67 | #env = ALE_env(rng, frame_skip=parameters.frame_skip, 68 | # ale_options=[{"key": "random_seed", "value": rng.randint(9999)}, 69 | # {"key": "color_averaging", "value": True}, 70 | # {"key": "repeat_action_probability", "value": 0.}]) 71 | 72 | env = ALE_env(rng, game=parameters.param1, frame_skip=parameters.frame_skip) 73 | 74 | # --- Instantiate qnetwork --- 75 | qnetwork = MyQNetwork( 76 | env, 77 | parameters.rms_decay, 78 | parameters.rms_epsilon, 79 | parameters.momentum, 80 | parameters.clip_norm, 81 | parameters.freeze_interval, 82 | parameters.batch_size, 83 | parameters.update_rule, 84 | rng, 85 | double_Q=True) 86 | 87 | train_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 1.) 88 | test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.05) 89 | 90 | # --- Instantiate agent --- 91 | agent = NeuralAgent( 92 | env, 93 | qnetwork, 94 | parameters.replay_memory_size, 95 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 96 | parameters.batch_size, 97 | rng, 98 | train_policy=train_policy, 99 | test_policy=test_policy) 100 | 101 | # --- Create unique filename for FindBestController --- 102 | h = hash(vars(parameters), hash_name="sha1") 103 | fname = "ALE_" + h 104 | print("The parameters hash is: {}".format(h)) 105 | print("The parameters are: {}".format(parameters)) 106 | 107 | # --- Bind controllers to the agent --- 108 | # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 109 | # learning rate as well as the training epoch number. 110 | agent.attach(bc.VerboseController( 111 | evaluate_on='epoch', 112 | periodicity=1)) 113 | 114 | # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 
115 | # Plus, we also want to display after each training episode (!= than after every training) the average bellman 116 | # residual and the average of the V values obtained during the last episode, hence the two last arguments. 117 | agent.attach(bc.TrainerController( 118 | evaluate_on='action', 119 | periodicity=parameters.update_frequency, 120 | show_episode_avg_V_value=True, 121 | show_avg_Bellman_residual=True)) 122 | 123 | # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 124 | # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 125 | agent.attach(bc.LearningRateController( 126 | initial_learning_rate=parameters.learning_rate, 127 | learning_rate_decay=parameters.learning_rate_decay, 128 | periodicity=1)) 129 | 130 | # Same for the discount factor. 131 | agent.attach(bc.DiscountFactorController( 132 | initial_discount_factor=parameters.discount, 133 | discount_factor_growth=parameters.discount_inc, 134 | discount_factor_max=parameters.discount_max, 135 | periodicity=1)) 136 | 137 | # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy 138 | # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more 139 | # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every 140 | # episode or epoch (or never, hence the resetEvery='none'). 141 | agent.attach(bc.EpsilonController( 142 | initial_e=parameters.epsilon_start, 143 | e_decays=parameters.epsilon_decay, 144 | e_min=parameters.epsilon_min, 145 | evaluate_on='action', 146 | periodicity=1, 147 | reset_every='none')) 148 | 149 | # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 150 | # seems to generalize the better, thus which one has the highest validation score. Here, we do not care about the 151 | # "true generalization score", or "test score". 152 | # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochControllers. It is 153 | # important that the validationID is the same than the id argument of the InterleavedTestEpochController. 154 | # The FindBestController will dump on disk the validation scores for each and every network, as well as the 155 | # structure of the neural network having the best validation score. These dumps can then used to plot the evolution 156 | # of the validation and test scores (see below) or simply recover the resulting neural network for your 157 | # application. 158 | # agent.attach(bc.FindBestController( 159 | # validationID=ALE_env.VALIDATION_MODE, 160 | # testID=None, 161 | # unique_fname=fname)) 162 | 163 | # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 164 | # "validation epoch" between each training epoch ("one of two epochs", hence the periodicity=2). We do not want 165 | # these validation epoch to interfere with the training of the agent, which is well established by the 166 | # TrainerController, EpsilonController and alike. Therefore, we will disable these controllers for the whole 167 | # duration of the validation epochs interleaved this way, using the controllersToDisable argument of the 168 | # InterleavedTestEpochController. For each validation epoch, we want also to display the sum of all rewards 169 | # obtained, hence the showScore=True. 
Finally, we want to call the summarizePerformance method of ALE_env every 170 | # [parameters.period_btw_summary_perfs] *validation* epochs. 171 | agent.attach(bc.InterleavedTestEpochController( 172 | id=ALE_env.VALIDATION_MODE, 173 | epoch_length=parameters.steps_per_test, 174 | periodicity=1, 175 | show_score=True, 176 | summarize_every=1)) 177 | 178 | agent.attach(bc.InterleavedTestEpochController( 179 | id=ALE_env.VALIDATION_MODE+1, 180 | epoch_length=parameters.steps_per_test, 181 | periodicity=1, 182 | show_score=True, 183 | summarize_every=1)) 184 | 185 | agent.attach(bc.InterleavedTestEpochController( 186 | id=ALE_env.VALIDATION_MODE+2, 187 | epoch_length=parameters.steps_per_test, 188 | periodicity=1, 189 | show_score=True, 190 | summarize_every=1)) 191 | 192 | agent.attach(bc.InterleavedTestEpochController( 193 | id=ALE_env.VALIDATION_MODE+3, 194 | epoch_length=parameters.steps_per_test, 195 | periodicity=1, 196 | show_score=True, 197 | summarize_every=1)) 198 | 199 | 200 | # --- Run the experiment --- 201 | try: 202 | os.mkdir("params") 203 | except Exception: 204 | pass 205 | dump(vars(parameters), "params/" + fname + ".jldump") 206 | agent.run(parameters.epochs, parameters.steps_per_epoch) 207 | 208 | # --- Show results --- 209 | basename = "scores/" + fname 210 | scores = load(basename + "_scores.jldump") 211 | plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') 212 | plt.legend() 213 | plt.xlabel("Number of epochs") 214 | plt.ylabel("Score") 215 | plt.savefig(basename + "_scores.pdf") 216 | plt.show() 217 | -------------------------------------------------------------------------------- /examples/MG_two_storages/data/BelgiumPV_prod_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/examples/MG_two_storages/data/BelgiumPV_prod_test.npy -------------------------------------------------------------------------------- /examples/MG_two_storages/data/BelgiumPV_prod_train.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/examples/MG_two_storages/data/BelgiumPV_prod_train.npy -------------------------------------------------------------------------------- /examples/MG_two_storages/data/example_nondeterminist_cons_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/examples/MG_two_storages/data/example_nondeterminist_cons_test.npy -------------------------------------------------------------------------------- /examples/MG_two_storages/data/example_nondeterminist_cons_train.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/examples/MG_two_storages/data/example_nondeterminist_cons_train.npy -------------------------------------------------------------------------------- /examples/MG_two_storages/data/spotmarket_data_2007-2013.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VinF/deer/4ff0c4a357f78b19ba4d240a0be1e37ca7ec5077/examples/MG_two_storages/data/spotmarket_data_2007-2013.xls -------------------------------------------------------------------------------- /examples/MG_two_storages/plot_MG_operation.py: 
-------------------------------------------------------------------------------- 1 | from mpl_toolkits.axes_grid1 import host_subplot 2 | import mpl_toolkits.axisartist as AA 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | def plot_op(actions, consumption,production,rewards,battery_level, plot_name): 8 | #### 9 | # PLOT 10 | #### 11 | 12 | print ( "In this plot, total score"+str(np.sum(rewards)) ) 13 | print ( "H:"+str( np.sum(actions)*0.1 ) ) 14 | print ( "-:"+str( np.sum(rewards)-np.sum(actions)*0.1 ) ) 15 | 16 | print ("battery_level") 17 | print (battery_level) 18 | print ("actions") 19 | print (actions) 20 | print ("consumption") 21 | print (consumption) 22 | print ("production") 23 | print (production) 24 | print ("rewards") 25 | print (rewards) 26 | 27 | steps=np.arange(100) 28 | print (steps) 29 | print ("battery_level") 30 | print (battery_level) 31 | print (consumption) 32 | print (production) 33 | print ("demand:") 34 | print (consumption-production) 35 | 36 | steps_long=np.arange(1000)/10. 37 | 38 | host = host_subplot(111, axes_class=AA.Axes) 39 | plt.subplots_adjust(left=0.2, right=0.8) 40 | 41 | par1 = host.twinx() 42 | par2 = host.twinx() 43 | par3 = host.twinx() 44 | 45 | offset = 60 46 | new_fixed_axis = par2.get_grid_helper().new_fixed_axis 47 | par2.axis["right"] = new_fixed_axis(loc="right", 48 | axes=par2, 49 | offset=(offset, 0)) 50 | par2.axis["right"].toggle(all=True) 51 | 52 | offset = -60 53 | new_fixed_axis = par3.get_grid_helper().new_fixed_axis 54 | par3.axis["right"] = new_fixed_axis(loc="left", 55 | axes=par3, 56 | offset=(offset, 0)) 57 | par3.axis["right"].toggle(all=True) 58 | 59 | 60 | host.set_xlim(-0.9, 99) 61 | host.set_ylim(0, 20.9) 62 | 63 | host.set_xlabel("Time (h)") 64 | host.set_ylabel("Battery level (kWh)") 65 | par1.set_ylabel("Consumption (kW)") 66 | par2.set_ylabel("Production (kW)") 67 | par3.set_ylabel("H Actions (kW)") 68 | 69 | p1, = host.plot(steps, battery_level, marker='o', lw=1, c = 'b', alpha=0.8, ls='-', label = 'Battery level') 70 | p2, = par1.plot(steps_long-0.9, np.repeat(consumption,10), lw=3, c = 'r', alpha=0.5, ls='-', label = 'Consumption') 71 | p3, = par2.plot(steps_long-0.9, np.repeat(production,10), lw=3, c = 'g', alpha=0.5, ls='-', label = 'Production') 72 | p4, = par3.plot(steps_long, np.repeat(actions,10), lw=3, c = 'c', alpha=0.5, ls='-', label = 'H Actions') 73 | 74 | par1.set_ylim(0, 10.09) 75 | par2.set_ylim(0, 10.09) 76 | par3.set_ylim(-1.5, 1.5) 77 | 78 | #host.legend(loc=2)#loc=9) 79 | 80 | host.axis["left"].label.set_color(p1.get_color()) 81 | par1.axis["right"].label.set_color(p2.get_color()) 82 | par2.axis["right"].label.set_color(p3.get_color()) 83 | par3.axis["right"].label.set_color(p4.get_color()) 84 | 85 | plt.savefig(plot_name) 86 | 87 | #plt.draw() 88 | #plt.show() 89 | #plt.close('all') 90 | -------------------------------------------------------------------------------- /examples/MG_two_storages/run_MG_two_storages.py: -------------------------------------------------------------------------------- 1 | """2-Storage Microgrid launcher. See the docs for more details about this experiment. 
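The script follows the same structure as the other launchers: it instantiates the microgrid
environment, a Q-network and a NeuralAgent, attaches the training controllers, and interleaves
validation and test epochs that are used by a FindBestController to select the best network.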
2 | 3 | """ 4 | 5 | import sys 6 | import logging 7 | import numpy as np 8 | from joblib import hash, dump, load 9 | import os 10 | import matplotlib.pyplot as plt 11 | 12 | import sys 13 | from os import path 14 | sys.path.append( path.dirname( path.dirname( path.abspath(__file__) ) ) ) 15 | 16 | from deer.default_parser import process_args 17 | from deer.agent import NeuralAgent 18 | from deer.learning_algos.q_net_keras import MyQNetwork 19 | from MG_two_storages_env import MyEnv as MG_two_storages_env 20 | import deer.experiment.base_controllers as bc 21 | 22 | class Defaults: 23 | # ---------------------- 24 | # Experiment Parameters 25 | # ---------------------- 26 | STEPS_PER_EPOCH = 365*24-1 27 | EPOCHS = 200 28 | STEPS_PER_TEST = 365*24-1 29 | PERIOD_BTW_SUMMARY_PERFS = -1 # Set to -1 for avoiding call to env.summarizePerformance 30 | 31 | # ---------------------- 32 | # Environment Parameters 33 | # ---------------------- 34 | FRAME_SKIP = 1 35 | 36 | # ---------------------- 37 | # DQN Agent parameters: 38 | # ---------------------- 39 | UPDATE_RULE = 'rmsprop' 40 | LEARNING_RATE = 0.0002 41 | LEARNING_RATE_DECAY = 0.99 42 | DISCOUNT = 0.9 43 | DISCOUNT_INC = 0.99 44 | DISCOUNT_MAX = 0.98 45 | RMS_DECAY = 0.9 46 | RMS_EPSILON = 0.0001 47 | MOMENTUM = 0 48 | CLIP_NORM = 1.0 49 | EPSILON_START = 1.0 50 | EPSILON_MIN = .3 51 | EPSILON_DECAY = 500000 52 | UPDATE_FREQUENCY = 1 53 | REPLAY_MEMORY_SIZE = 1000000 54 | BATCH_SIZE = 32 55 | FREEZE_INTERVAL = 1000 56 | DETERMINISTIC = False 57 | 58 | 59 | 60 | 61 | if __name__ == "__main__": 62 | logging.basicConfig(level=logging.INFO) 63 | 64 | # --- Parse parameters --- 65 | parameters = process_args(sys.argv[1:], Defaults) 66 | 67 | if parameters.deterministic: 68 | rng = np.random.RandomState(123456) 69 | else: 70 | rng = np.random.RandomState() 71 | 72 | if(parameters.param1 is not None and parameters.param1!="1"): 73 | # We Reduce the size of the time series so that the number of days is divisible by 4*parameters.param1 74 | # That way, the number of days in each season is divisible by parameters.param1 and it is thus possible 75 | # to reduce the variety of the data within each season in the time series by a factor of parameters.param1 76 | parameters.steps_per_epoch=parameters.steps_per_epoch-(parameters.steps_per_epoch%(24*4*int(parameters.param1)))-1 77 | 78 | # --- Instantiate environment --- 79 | env = MG_two_storages_env(rng, parameters.param1, parameters.param2, parameters.param3) 80 | 81 | # --- Instantiate qnetwork --- 82 | qnetwork = MyQNetwork( 83 | env, 84 | parameters.rms_decay, 85 | parameters.rms_epsilon, 86 | parameters.momentum, 87 | parameters.clip_norm, 88 | parameters.freeze_interval, 89 | parameters.batch_size, 90 | parameters.update_rule, 91 | rng) 92 | 93 | # --- Instantiate agent --- 94 | agent = NeuralAgent( 95 | env, 96 | qnetwork, 97 | parameters.replay_memory_size, 98 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 99 | parameters.batch_size, 100 | rng) 101 | 102 | # --- Create unique filename for FindBestController --- 103 | h = hash(vars(parameters), hash_name="sha1") 104 | fname = "MG2S_" + h 105 | print("The parameters hash is: {}".format(h)) 106 | print("The parameters are: {}".format(parameters)) 107 | 108 | # --- Bind controllers to the agent --- 109 | # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 110 | # learning rate as well as the training epoch number. 
111 | agent.attach(bc.VerboseController( 112 | evaluate_on='epoch', 113 | periodicity=1)) 114 | 115 | # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 116 | # Plus, we also want to display after each training episode (!= than after every training) the average bellman 117 | # residual and the average of the V values obtained during the last episode, hence the two last arguments. 118 | agent.attach(bc.TrainerController( 119 | evaluate_on='action', 120 | periodicity=parameters.update_frequency, 121 | show_episode_avg_V_value=True, 122 | show_avg_Bellman_residual=True)) 123 | 124 | # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 125 | # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 126 | agent.attach(bc.LearningRateController( 127 | initial_learning_rate=parameters.learning_rate, 128 | learning_rate_decay=parameters.learning_rate_decay, 129 | periodicity=1)) 130 | 131 | # Same for the discount factor. 132 | agent.attach(bc.DiscountFactorController( 133 | initial_discount_factor=parameters.discount, 134 | discount_factor_growth=parameters.discount_inc, 135 | discount_factor_max=parameters.discount_max, 136 | periodicity=1)) 137 | 138 | # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy 139 | # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more 140 | # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every 141 | # episode or epoch (or never, hence the resetEvery='none'). 142 | agent.attach(bc.EpsilonController( 143 | initial_e=parameters.epsilon_start, 144 | e_decays=parameters.epsilon_decay, 145 | e_min=parameters.epsilon_min, 146 | evaluate_on='action', 147 | periodicity=1, 148 | reset_every='none')) 149 | 150 | # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 151 | # seems to generalize the best, thus which one has the highest validation score. However we also want to keep 152 | # track of a "true generalization score", the "test score". Indeed, what if we overfit the validation score ? 153 | # To achieve these goals, one can use the FindBestController along two InterleavedTestEpochControllers, one for 154 | # each mode (validation and test). It is important that the validationID and testID are the same than the id 155 | # argument of the two InterleavedTestEpochControllers (implementing the validation mode and test mode 156 | # respectively). The FindBestController will dump on disk the validation and test scores for each and every 157 | # network, as well as the structure of the neural network having the best validation score. These dumps can then 158 | # used to plot the evolution of the validation and test scores (see below) or simply recover the resulting neural 159 | # network for your application. 160 | agent.attach(bc.FindBestController( 161 | validationID=env.VALIDATION_MODE, 162 | testID=env.TEST_MODE, 163 | unique_fname=fname)) 164 | 165 | # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 166 | # "validation epoch" between each training epoch (hence the periodicity=1). For each validation epoch, we want also 167 | # to display the sum of all rewards obtained, hence the showScore=True. 
Finally, we never want this controller to 168 | # call the summarizePerformance method of MG_two_storage_env. 169 | agent.attach(bc.InterleavedTestEpochController( 170 | id=env.VALIDATION_MODE, 171 | epoch_length=parameters.steps_per_epoch, 172 | periodicity=1, 173 | show_score=True, 174 | summarize_every=-1)) 175 | 176 | # Besides inserting a validation epoch (required if one wants to find the best neural network over all training 177 | # epochs), we also wish to interleave a "test epoch" between each training epoch. For each test epoch, we also 178 | # want to display the sum of all rewards obtained, hence the showScore=True. Finally, we want to call the 179 | # summarizePerformance method of MG_two_storage_env every [parameters.period_btw_summary_perfs] *test* epochs. 180 | agent.attach(bc.InterleavedTestEpochController( 181 | id=env.TEST_MODE, 182 | epoch_length=parameters.steps_per_test, 183 | periodicity=1, 184 | show_score=True, 185 | summarize_every=parameters.period_btw_summary_perfs)) 186 | 187 | # --- Run the experiment --- 188 | try: 189 | os.mkdir("params") 190 | except Exception: 191 | pass 192 | dump(vars(parameters), "params/" + fname + ".jldump") 193 | 194 | agent.run(parameters.epochs, parameters.steps_per_epoch) 195 | 196 | # --- Show results --- 197 | basename = "scores/" + fname 198 | scores = load(basename + "_scores.jldump") 199 | plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') 200 | plt.plot(range(1, len(scores['ts'])+1), scores['ts'], label="TS", color='r') 201 | plt.legend() 202 | plt.xlabel("Number of epochs") 203 | plt.ylabel("Score") 204 | plt.savefig(basename + "_scores.pdf") 205 | plt.show() 206 | -------------------------------------------------------------------------------- /examples/gym/mountain_car_continuous_env.py: -------------------------------------------------------------------------------- 1 | """ Mountain car environment with continuous action space. 2 | 3 | Author: Vincent Francois-Lavet 4 | """ 5 | 6 | import numpy as np 7 | import copy 8 | import math 9 | from deer.base_classes import Environment 10 | import gym 11 | 12 | class MyEnv(Environment): 13 | def __init__(self, rng): 14 | """ Initialize environment. 15 | 16 | Parameters 17 | ----------- 18 | rng : numpy random number generator 19 | """ 20 | self.env = gym.make('MountainCarContinuous-v0') 21 | self.rng=rng 22 | self._last_observation = self.reset() 23 | self.is_terminal=False 24 | self._input_dim = [(1,), (1,)] 25 | 26 | def act(self, action): 27 | """ Simulate one time step in the environment and returns the reward for the time step 28 | 29 | Parameters 30 | ----------- 31 | action : list of floats (in this case one float, because there is one action) 32 | 33 | Returns 34 | ------- 35 | reward : float 36 | """ 37 | reward=0 38 | for _ in range(10): # Increase the duration of one time step by a factor 10 39 | self._last_observation, r, self.is_terminal, info = self.env.step([action[0]]) 40 | reward+=r 41 | if(self.is_terminal==True): 42 | break 43 | 44 | if (self.mode==0): # Show the policy only at test time 45 | try: 46 | self.env.render() 47 | except: 48 | pass 49 | 50 | return reward/100. #Scale the reward so that it's 1 at maximum 51 | 52 | def reset(self, mode=0): 53 | """ Reset environment for a new episode. 
54 | 55 | Parameters 56 | ----------- 57 | Mode : int 58 | -1 corresponds to training and 0 to test 59 | """ 60 | self.mode=mode 61 | 62 | self._last_observation = self.env.reset() 63 | 64 | self.is_terminal=False 65 | 66 | return self._last_observation 67 | 68 | def inTerminalState(self): 69 | """ This returns whether the environment reached a terminal state after the last transition 70 | (i.e. whether the last transition that occurred was terminal). 71 | 72 | Returns 73 | ------- 74 | self.is_terminal : bool 75 | """ 76 | return self.is_terminal 77 | 78 | def inputDimensions(self): 79 | return self._input_dim 80 | 81 | def nActions(self): 82 | """ Provides the bounds on the action space 83 | 84 | Returns 85 | ------- 86 | bounds on the action space 87 | """ 88 | return [[self.env.action_space.low[0],self.env.action_space.high[0]]] 89 | 90 | def observe(self): 91 | return copy.deepcopy(self._last_observation) 92 | 93 | def main(): 94 | # This function can be used for debug purposes 95 | rng = np.random.RandomState(123456) 96 | myenv=MyEnv(rng) 97 | print(myenv.env.action_space) 98 | print(myenv.env.action_space.low) 99 | print(myenv.env.action_space.high) 100 | print(myenv.env.observation_space) 101 | 102 | print (myenv.observe()) 103 | myenv.act([0]) 104 | print (myenv.observe()) 105 | myenv.act([1]) 106 | print (myenv.observe()) 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /examples/gym/mountain_car_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import math 4 | from deer.base_classes import Environment 5 | import gym 6 | import sys 7 | 8 | class MyEnv(Environment): 9 | def __init__(self, rng): 10 | """ Initialize environment. 11 | 12 | Arguments: 13 | rng - the numpy random number generator 14 | """ 15 | gym.envs.register( 16 | id='MountainCarModified-v0', 17 | entry_point='gym.envs.classic_control:MountainCarEnv', 18 | max_episode_steps=500, # MountainCar-v0 uses 200 19 | reward_threshold=-110.0, 20 | ) 21 | 22 | self.env = gym.make('MountainCarModified-v0') 23 | self.env.max_episode_steps = 500 24 | self.rng=rng 25 | self._last_observation = self.env.reset() 26 | self.is_terminal=False 27 | self._input_dim = [(1,), (1,)] # self.env.observation_space.shape is equal to 2 28 | # and we use only the current observation in the pseudo-state 29 | 30 | def act(self, action): 31 | """ Simulate one time step in the environment. 32 | """ 33 | reward=0 34 | nsteps=10 35 | for _ in range(nsteps): 36 | self._last_observation, r, self.is_terminal, info = self.env.step(action) 37 | reward+=r 38 | if(self.is_terminal==True): 39 | reward+=3*nsteps 40 | break 41 | 42 | if (self.mode==0): # Show the policy only at test time 43 | try: 44 | self.env.render() 45 | except: 46 | pass 47 | #print("Warning:", sys.exc_info()[0]) 48 | 49 | #s=copy.deepcopy(self._last_observation) 50 | ## Possibility to add a reward shaping for faster convergence 51 | #s[0]+=math.pi/6 52 | #if(s[0]>0): 53 | # reward+=pow(s[0],2)#np.linalg.norm(s[0]) 54 | 55 | return reward/nsteps 56 | 57 | def reset(self, mode=0): 58 | """ Reset environment for a new episode. 
59 | 60 | Arguments: 61 | Mode : int 62 | -1 corresponds to training and 0 to test 63 | """ 64 | self.mode=mode 65 | 66 | self._last_observation = self.env.reset() 67 | # DEEPRECATED 68 | #if (self.mode==-1): # Reset to a random value when in training mode (that allows to increase exploration) 69 | # high=self.env.observation_space.high 70 | # low=self.env.observation_space.low 71 | # self._last_observation=low+self.rng.rand(2)*(high-low) 72 | # self.env.env.state=self._last_observation 73 | 74 | self.is_terminal=False 75 | 76 | 77 | return self._last_observation 78 | 79 | def inTerminalState(self): 80 | """ Tell whether the environment reached a terminal state after the last transition (i.e. the last transition 81 | that occured was terminal). 82 | """ 83 | return self.is_terminal 84 | 85 | def inputDimensions(self): 86 | return self._input_dim 87 | 88 | def nActions(self): 89 | return 3 #Would be useful to have this directly in gym : self.env.action_space.shape 90 | 91 | def observe(self): 92 | return copy.deepcopy(self._last_observation) 93 | 94 | def main(): 95 | # This function can be used for debug purposes 96 | rng = np.random.RandomState(123456) 97 | myenv=MyEnv(rng) 98 | 99 | print (myenv.observe()) 100 | 101 | if __name__ == "__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /examples/gym/pendulum_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | from deer.base_classes import Environment 5 | import gym 6 | 7 | class MyEnv(Environment): 8 | def __init__(self, rng): 9 | """ Initialize environment. 10 | 11 | Arguments: 12 | rng - the numpy random number generator 13 | """ 14 | # Defining the type of environment 15 | self.env = gym.make('CartPole-v0') 16 | self._last_observation = self.env.reset() 17 | self.is_terminal=False 18 | self._input_dim = [(1,), (1,), (1,), (1,)] # self.env.observation_space.shape is equal to 4 19 | # and we use only the current observations in the pseudo-state 20 | 21 | def act(self, action): 22 | """ Simulate one time step in the environment. 23 | """ 24 | 25 | self._last_observation, reward, self.is_terminal, info = self.env.step(action) 26 | if (self.mode==0): # Show the policy only at test time 27 | self.env.render() 28 | 29 | return reward 30 | 31 | def reset(self, mode=0): 32 | """ Reset environment for a new episode. 33 | 34 | Arguments: 35 | Mode : int 36 | -1 corresponds to training and 0 to test 37 | """ 38 | # Reset initial observation to a random x and theta 39 | self._last_observation = self.env.reset() 40 | self.is_terminal=False 41 | self.mode=mode 42 | 43 | return self._last_observation 44 | 45 | def inTerminalState(self): 46 | """Tell whether the environment reached a terminal state after the last transition (i.e. the last transition 47 | that occured was terminal). 
48 | """ 49 | return self.is_terminal 50 | 51 | def inputDimensions(self): 52 | return self._input_dim 53 | 54 | def nActions(self): 55 | return 2 #Would be useful to have this directly in gym : self.env.action_space.shape 56 | 57 | def observe(self): 58 | return copy.deepcopy(self._last_observation) 59 | 60 | def main(): 61 | rng = np.random.RandomState(123456) 62 | myenv=MyEnv(rng) 63 | 64 | print (myenv.observe()) 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /examples/gym/run_mountain_car.py: -------------------------------------------------------------------------------- 1 | """ Mountain car environment launcher. 2 | Same principles as run_toy_env. See the docs for more details. 3 | 4 | """ 5 | 6 | import sys 7 | import logging 8 | import numpy as np 9 | 10 | import deer.experiment.base_controllers as bc 11 | from deer.default_parser import process_args 12 | from deer.agent import NeuralAgent 13 | from deer.learning_algos.q_net_keras import MyQNetwork 14 | from mountain_car_env import MyEnv as mountain_car_env 15 | from deer.policies import EpsilonGreedyPolicy,LongerExplorationPolicy 16 | 17 | class Defaults: 18 | # ---------------------- 19 | # Experiment Parameters 20 | # ---------------------- 21 | STEPS_PER_EPOCH = 200 22 | EPOCHS = 200 23 | STEPS_PER_TEST = 200 24 | PERIOD_BTW_SUMMARY_PERFS = 10 25 | 26 | # ---------------------- 27 | # Environment Parameters 28 | # ---------------------- 29 | FRAME_SKIP = 1 30 | 31 | # ---------------------- 32 | # DQN Agent parameters: 33 | # ---------------------- 34 | UPDATE_RULE = 'rmsprop' 35 | LEARNING_RATE = 0.005 36 | LEARNING_RATE_DECAY = 0.99 37 | DISCOUNT = 0.9 38 | DISCOUNT_INC = 0.99 39 | DISCOUNT_MAX = 0.95 40 | RMS_DECAY = 0.9 41 | RMS_EPSILON = 0.0001 42 | MOMENTUM = 0 43 | CLIP_NORM = 1.0 44 | EPSILON_START = 1.0 45 | EPSILON_MIN = 0.2 46 | EPSILON_DECAY = 10000 47 | UPDATE_FREQUENCY = 1 48 | REPLAY_MEMORY_SIZE = 1000000 49 | BATCH_SIZE = 32 50 | FREEZE_INTERVAL = 100 51 | DETERMINISTIC = True 52 | 53 | if __name__ == "__main__": 54 | logging.basicConfig(level=logging.INFO) 55 | 56 | # --- Parse parameters --- 57 | parameters = process_args(sys.argv[1:], Defaults) 58 | if parameters.deterministic: 59 | rng = np.random.RandomState(12345) 60 | else: 61 | rng = np.random.RandomState() 62 | 63 | # --- Instantiate environment --- 64 | env = mountain_car_env(rng) 65 | 66 | # --- Instantiate qnetwork --- 67 | qnetwork = MyQNetwork( 68 | env, 69 | parameters.rms_decay, 70 | parameters.rms_epsilon, 71 | parameters.momentum, 72 | parameters.clip_norm, 73 | parameters.freeze_interval, 74 | parameters.batch_size, 75 | parameters.update_rule, 76 | rng, 77 | double_Q=True) 78 | 79 | train_policy = LongerExplorationPolicy(qnetwork, env.nActions(), rng, 1.0)#EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.) 80 | test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.) 
81 | 82 | # --- Instantiate agent --- 83 | agent = NeuralAgent( 84 | env, 85 | qnetwork, 86 | parameters.replay_memory_size, 87 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 88 | parameters.batch_size, 89 | rng, 90 | exp_priority=1., 91 | train_policy=train_policy, 92 | test_policy=test_policy) 93 | 94 | # --- Bind controllers to the agent --- 95 | # For comments, please refer to run_toy_env.py 96 | agent.attach(bc.VerboseController( 97 | evaluate_on='epoch', 98 | periodicity=1)) 99 | 100 | agent.attach(bc.TrainerController( 101 | evaluate_on='action', 102 | periodicity=parameters.update_frequency, 103 | show_episode_avg_V_value=True, 104 | show_avg_Bellman_residual=True)) 105 | 106 | agent.attach(bc.LearningRateController( 107 | initial_learning_rate=parameters.learning_rate, 108 | learning_rate_decay=parameters.learning_rate_decay, 109 | periodicity=1)) 110 | 111 | agent.attach(bc.DiscountFactorController( 112 | initial_discount_factor=parameters.discount, 113 | discount_factor_growth=parameters.discount_inc, 114 | discount_factor_max=parameters.discount_max, 115 | periodicity=1)) 116 | 117 | agent.attach(bc.EpsilonController( 118 | initial_e=parameters.epsilon_start, 119 | e_decays=parameters.epsilon_decay, 120 | e_min=parameters.epsilon_min, 121 | evaluate_on='action', 122 | periodicity=1, 123 | reset_every='none')) 124 | 125 | agent.attach(bc.InterleavedTestEpochController( 126 | id=0, 127 | epoch_length=parameters.steps_per_test, 128 | periodicity=1, 129 | show_score=True, 130 | summarize_every=parameters.period_btw_summary_perfs)) 131 | 132 | # --- Run the experiment --- 133 | agent.run(parameters.epochs, parameters.steps_per_epoch) 134 | -------------------------------------------------------------------------------- /examples/gym/run_mountain_car_continuous.py: -------------------------------------------------------------------------------- 1 | """ Launcher for mountain car environment with continuous action space. 2 | Same principles as run_toy_env. See the wiki for more details. 
3 | 4 | """ 5 | 6 | import sys 7 | import logging 8 | import numpy as np 9 | 10 | import deer.experiment.base_controllers as bc 11 | from deer.default_parser import process_args 12 | from deer.agent import NeuralAgent 13 | from deer.learning_algos.AC_net_keras import MyACNetwork 14 | from mountain_car_continuous_env import MyEnv as mountain_car_continuous_env 15 | from deer.policies import LongerExplorationPolicy 16 | 17 | 18 | class Defaults: 19 | # ---------------------- 20 | # Experiment Parameters 21 | # ---------------------- 22 | STEPS_PER_EPOCH = 200 23 | EPOCHS = 200 24 | STEPS_PER_TEST = 200 25 | PERIOD_BTW_SUMMARY_PERFS = 10 26 | 27 | # ---------------------- 28 | # Environment Parameters 29 | # ---------------------- 30 | FRAME_SKIP = 1 31 | 32 | # ---------------------- 33 | # DQN Agent parameters: 34 | # ---------------------- 35 | UPDATE_RULE = 'rmsprop' 36 | LEARNING_RATE = 0.002 37 | LEARNING_RATE_DECAY = 0.99 38 | DISCOUNT = 0.9 39 | DISCOUNT_INC = 0.99 40 | DISCOUNT_MAX = 0.95 41 | RMS_DECAY = 0.9 42 | RMS_EPSILON = 0.0001 43 | MOMENTUM = 0 44 | CLIP_NORM = 1.0 45 | EPSILON_START = 1.0 46 | EPSILON_MIN = 0.2 47 | EPSILON_DECAY = 10000 48 | UPDATE_FREQUENCY = 1 49 | REPLAY_MEMORY_SIZE = 1000000 50 | BATCH_SIZE = 32 51 | FREEZE_INTERVAL = 100 52 | DETERMINISTIC = True 53 | 54 | if __name__ == "__main__": 55 | logging.basicConfig(level=logging.INFO) 56 | 57 | # --- Parse parameters --- 58 | parameters = process_args(sys.argv[1:], Defaults) 59 | if parameters.deterministic: 60 | rng = np.random.RandomState(12345) 61 | else: 62 | rng = np.random.RandomState() 63 | 64 | # --- Instantiate environment --- 65 | env = mountain_car_continuous_env(rng) 66 | 67 | # --- Instantiate qnetwork --- 68 | qnetwork = MyACNetwork( 69 | env, 70 | parameters.rms_decay, 71 | parameters.rms_epsilon, 72 | parameters.momentum, 73 | parameters.clip_norm, 74 | parameters.freeze_interval, 75 | parameters.batch_size, 76 | parameters.update_rule, 77 | rng) 78 | 79 | train_policy=LongerExplorationPolicy(qnetwork, env.nActions(), rng, 1.,10) 80 | 81 | # --- Instantiate agent --- 82 | agent = NeuralAgent( 83 | env, 84 | qnetwork, 85 | parameters.replay_memory_size, 86 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 87 | parameters.batch_size, 88 | rng, 89 | exp_priority=1., 90 | train_policy=train_policy) 91 | 92 | # --- Bind controllers to the agent --- 93 | # For comments, please refer to run_toy_env.py 94 | agent.attach(bc.VerboseController( 95 | evaluate_on='epoch', 96 | periodicity=1)) 97 | 98 | agent.attach(bc.TrainerController( 99 | evaluate_on='action', 100 | periodicity=parameters.update_frequency, 101 | show_episode_avg_V_value=True, 102 | show_avg_Bellman_residual=True)) 103 | 104 | agent.attach(bc.LearningRateController( 105 | initial_learning_rate=parameters.learning_rate, 106 | learning_rate_decay=parameters.learning_rate_decay, 107 | periodicity=1)) 108 | 109 | agent.attach(bc.DiscountFactorController( 110 | initial_discount_factor=parameters.discount, 111 | discount_factor_growth=parameters.discount_inc, 112 | discount_factor_max=parameters.discount_max, 113 | periodicity=1)) 114 | 115 | agent.attach(bc.EpsilonController( 116 | initial_e=parameters.epsilon_start, 117 | e_decays=parameters.epsilon_decay, 118 | e_min=parameters.epsilon_min, 119 | evaluate_on='action', 120 | periodicity=1, 121 | reset_every='none')) 122 | 123 | agent.attach(bc.InterleavedTestEpochController( 124 | id=0, 125 | epoch_length=parameters.steps_per_test, 126 | periodicity=1, 127 | 
show_score=True, 128 | summarize_every=parameters.period_btw_summary_perfs)) 129 | 130 | # --- Run the experiment --- 131 | agent.run(parameters.epochs, parameters.steps_per_epoch) 132 | -------------------------------------------------------------------------------- /examples/gym/run_pendulum.py: -------------------------------------------------------------------------------- 1 | """ Pendulum environment launcher. 2 | Same principles as run_toy_env. See the docs for more details. 3 | 4 | Authors: Vincent Francois-Lavet, David Taralla 5 | """ 6 | 7 | import sys 8 | import logging 9 | import numpy as np 10 | 11 | import deer.experiment.base_controllers as bc 12 | from deer.default_parser import process_args 13 | from deer.agent import NeuralAgent 14 | from deer.learning_algos.q_net_keras import MyQNetwork 15 | from pendulum_env import MyEnv as pendulum_env 16 | 17 | class Defaults: 18 | # ---------------------- 19 | # Experiment Parameters 20 | # ---------------------- 21 | STEPS_PER_EPOCH = 100 22 | EPOCHS = 200 23 | STEPS_PER_TEST = 100 24 | PERIOD_BTW_SUMMARY_PERFS = 10 25 | 26 | # ---------------------- 27 | # Environment Parameters 28 | # ---------------------- 29 | FRAME_SKIP = 1 30 | 31 | # ---------------------- 32 | # DQN Agent parameters: 33 | # ---------------------- 34 | UPDATE_RULE = 'rmsprop' 35 | LEARNING_RATE = 0.0002 36 | LEARNING_RATE_DECAY = 0.99 37 | DISCOUNT = 0.9 38 | DISCOUNT_INC = 1. 39 | DISCOUNT_MAX = 0.95 40 | RMS_DECAY = 0.9 41 | RMS_EPSILON = 0.0001 42 | MOMENTUM = 0 43 | CLIP_NORM = 1.0 44 | EPSILON_START = 1.0 45 | EPSILON_MIN = 0.2 46 | EPSILON_DECAY = 10000 47 | UPDATE_FREQUENCY = 1 48 | REPLAY_MEMORY_SIZE = 1000000 49 | BATCH_SIZE = 32 50 | FREEZE_INTERVAL = 500 51 | DETERMINISTIC = True 52 | 53 | if __name__ == "__main__": 54 | logging.basicConfig(level=logging.INFO) 55 | 56 | # --- Parse parameters --- 57 | parameters = process_args(sys.argv[1:], Defaults) 58 | if parameters.deterministic: 59 | rng = np.random.RandomState(12345) 60 | else: 61 | rng = np.random.RandomState() 62 | 63 | # --- Instantiate environment --- 64 | env = pendulum_env(rng) 65 | 66 | # --- Instantiate qnetwork --- 67 | qnetwork = MyQNetwork( 68 | env, 69 | parameters.rms_decay, 70 | parameters.rms_epsilon, 71 | parameters.momentum, 72 | parameters.clip_norm, 73 | parameters.freeze_interval, 74 | parameters.batch_size, 75 | parameters.update_rule, 76 | rng, 77 | double_Q=True) 78 | 79 | # --- Instantiate agent --- 80 | agent = NeuralAgent( 81 | env, 82 | qnetwork, 83 | parameters.replay_memory_size, 84 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 85 | parameters.batch_size, 86 | rng) 87 | 88 | # --- Bind controllers to the agent --- 89 | # For comments, please refer to run_toy_env.py 90 | agent.attach(bc.VerboseController( 91 | evaluate_on='epoch', 92 | periodicity=1)) 93 | 94 | agent.attach(bc.TrainerController( 95 | evaluate_on='action', 96 | periodicity=parameters.update_frequency, 97 | show_episode_avg_V_value=False, 98 | show_avg_Bellman_residual=False)) 99 | 100 | agent.attach(bc.LearningRateController( 101 | initial_learning_rate=parameters.learning_rate, 102 | learning_rate_decay=parameters.learning_rate_decay, 103 | periodicity=1)) 104 | 105 | agent.attach(bc.DiscountFactorController( 106 | initial_discount_factor=parameters.discount, 107 | discount_factor_growth=parameters.discount_inc, 108 | discount_factor_max=parameters.discount_max, 109 | periodicity=1)) 110 | 111 | agent.attach(bc.EpsilonController( 112 | 
initial_e=parameters.epsilon_start, 113 | e_decays=parameters.epsilon_decay, 114 | e_min=parameters.epsilon_min, 115 | evaluate_on='action', 116 | periodicity=1, 117 | reset_every='none')) 118 | 119 | agent.attach(bc.InterleavedTestEpochController( 120 | id=0, 121 | epoch_length=parameters.steps_per_test, 122 | periodicity=1, 123 | show_score=True, 124 | summarize_every=parameters.period_btw_summary_perfs)) 125 | 126 | # --- Run the experiment --- 127 | agent.run(parameters.epochs, parameters.steps_per_epoch) 128 | -------------------------------------------------------------------------------- /examples/maze/a_star_path_finding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code from https://github.com/laurentluce/python-algorithms/blob/master/algorithms/a_star_path_finding.py 3 | """ 4 | 5 | import heapq 6 | import numpy as np 7 | 8 | class Cell(object): 9 | def __init__(self, x, y, reachable): 10 | """Initialize new cell. 11 | 12 | @param reachable is cell reachable? not a wall? 13 | @param x cell x coordinate 14 | @param y cell y coordinate 15 | @param g cost to move from the starting cell to this cell. 16 | @param h estimation of the cost to move from this cell 17 | to the ending cell. 18 | @param f f = g + h 19 | """ 20 | self.reachable = reachable 21 | self.x = x 22 | self.y = y 23 | self.parent = None 24 | self.g = 0 25 | self.h = 0 26 | self.f = 0 27 | 28 | def __lt__(self, other): 29 | return self.f < other.f 30 | 31 | #class Cell(object): 32 | # def __init__(self, x, y, reachable): 33 | # """Initialize new cell. 34 | # @param reachable is cell reachable? not a wall? 35 | # @param x cell x coordinate 36 | # @param y cell y coordinate 37 | # @param g cost to move from the starting cell to this cell. 38 | # @param h estimation of the cost to move from this cell 39 | # to the ending cell. 40 | # @param f f = g + h 41 | # """ 42 | # self.reachable = reachable 43 | ## self.occupied = False 44 | # self.x = x 45 | # self.y = y 46 | # self.parent = None 47 | # self.g = 0 48 | # self.h = 0 49 | # self.f = 0 50 | 51 | 52 | class AStar(object): 53 | def __init__(self): 54 | # open list 55 | self.opened = [] 56 | heapq.heapify(self.opened) 57 | # visited cells list 58 | self.closed = set() 59 | # grid cells 60 | self.cells = [] 61 | self.grid_height = None 62 | self.grid_width = None 63 | 64 | def init_grid(self, width, height, walls, start, end): 65 | """Prepare grid cells, walls. 66 | @param width grid's width. 67 | @param height grid's height. 68 | @param walls list of wall x,y tuples. 69 | @param start grid starting point x,y tuple. 70 | @param end grid ending point x,y tuple. 71 | """ 72 | self.grid_height = height 73 | self.grid_width = width 74 | for x in range(self.grid_width): 75 | for y in range(self.grid_height): 76 | if (x, y) in walls: 77 | reachable = False 78 | else: 79 | reachable = True 80 | self.cells.append(Cell(x, y, reachable)) 81 | self.start = self.get_cell(*start) 82 | self.start.reachable=True 83 | 84 | self.agent_cell=self.start 85 | self.end = self.get_cell(*end) 86 | 87 | def get_heuristic(self, cell): 88 | """Compute the heuristic value H for a cell. 89 | Distance between this cell and the ending cell multiply by 10. 90 | @returns heuristic value H 91 | """ 92 | return 10 * (abs(cell.x - self.end.x) + abs(cell.y - self.end.y)) 93 | 94 | def get_cell(self, x, y): 95 | """Returns a cell from the cells list. 
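        Cells are stored in a flat list built in init_grid, hence the
        x * grid_height + y indexing below.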
96 | @param x cell x coordinate 97 | @param y cell y coordinate 98 | @returns cell 99 | """ 100 | return self.cells[x * self.grid_height + y] 101 | 102 | def get_adjacent_cells(self, cell): 103 | """Returns adjacent cells to a cell. 104 | Clockwise starting from the one on the right. 105 | @param cell get adjacent cells for this cell 106 | @returns adjacent cells list. 107 | """ 108 | cells = [] 109 | if cell.x < self.grid_width-1: 110 | cells.append(self.get_cell(cell.x+1, cell.y)) 111 | if cell.y > 0: 112 | cells.append(self.get_cell(cell.x, cell.y-1)) 113 | if cell.x > 0: 114 | cells.append(self.get_cell(cell.x-1, cell.y)) 115 | if cell.y < self.grid_height-1: 116 | cells.append(self.get_cell(cell.x, cell.y+1)) 117 | return cells 118 | 119 | def get_path(self): 120 | cell = self.end 121 | path = [(cell.x, cell.y)] 122 | while cell.parent is not self.start: 123 | cell = cell.parent 124 | path.append((cell.x, cell.y)) 125 | 126 | path.append((self.start.x, self.start.y)) 127 | path.reverse() 128 | return path 129 | 130 | def update_cell(self, adj, cell): 131 | """Update adjacent cell. 132 | @param adj adjacent cell to current cell 133 | @param cell current cell being processed 134 | """ 135 | adj.g = cell.g + 10 136 | adj.h = self.get_heuristic(adj) 137 | adj.parent = cell 138 | adj.f = adj.h + adj.g 139 | 140 | def solve(self): 141 | """Solve maze, find path to ending cell. 142 | @returns path or None if not found. 143 | """ 144 | # add starting cell to open heap queue 145 | heapq.heappush(self.opened, (self.start.f, self.start)) 146 | while len(self.opened): 147 | # pop cell from heap queue 148 | f, cell = heapq.heappop(self.opened) 149 | # add cell to closed list so we don't process it twice 150 | self.closed.add(cell) 151 | # if ending cell, return found path 152 | if cell is self.end: 153 | return self.get_path() 154 | # get adjacent cells for cell 155 | adj_cells = self.get_adjacent_cells(cell) 156 | for adj_cell in adj_cells: 157 | if adj_cell.reachable and adj_cell not in self.closed: 158 | if (adj_cell.f, adj_cell) in self.opened: 159 | # if adj cell in open list, check if current path is 160 | # better than the one previously found 161 | # for this adj cell. 
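                        # (cell.g + 10 is the cost of reaching adj_cell through the current
                        # cell, since update_cell uses a uniform move cost of 10 per step.)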
162 | if adj_cell.g > cell.g + 10: 163 | self.update_cell(adj_cell, cell) 164 | else: 165 | self.update_cell(adj_cell, cell) 166 | # add adj cell to open list 167 | heapq.heappush(self.opened, (adj_cell.f, adj_cell)) 168 | 169 | def get_maze_array(self): 170 | maze=[] 171 | for i in range(self.grid_height): 172 | row=[] 173 | for j in range(self.grid_width): 174 | if(self.get_cell(i, j) is self.agent_cell): 175 | row.append(1) 176 | elif(self.get_cell(i, j) is self.end): 177 | row.append(2) 178 | elif(self.get_cell(i, j).reachable==True): 179 | row.append(0) 180 | else: 181 | row.append(-1) 182 | maze.append(row) 183 | 184 | return maze -------------------------------------------------------------------------------- /examples/maze/maze_env.py: -------------------------------------------------------------------------------- 1 | """ Environment with a distribution of mazes (one new maze is drawn at each episode) 2 | 3 | Author: Vincent Francois-Lavet 4 | """ 5 | import numpy as np 6 | 7 | from deer.base_classes import Environment 8 | 9 | #import matplotlib 10 | #matplotlib.use('qt5agg') 11 | #from mpl_toolkits.axes_grid1 import host_subplot 12 | #import mpl_toolkits.axisartist as AA 13 | #import matplotlib.pyplot as plt 14 | import copy 15 | import a_star_path_finding as pf 16 | 17 | class MyEnv(Environment): 18 | VALIDATION_MODE = 0 19 | 20 | def __init__(self, rng, **kwargs): 21 | 22 | self._random_state = rng 23 | self._mode = -1 24 | self._mode_score = 0.0 25 | self._mode_episode_count = 0 26 | self._episode_steps = 0 27 | self._actions = [0,1,2,3] 28 | self._size_maze = 8 29 | self._higher_dim_obs=kwargs.get('higher_dim_obs',False) 30 | self._reverse=kwargs.get('reverse',False) 31 | 32 | self._n_walls = int((self._size_maze-2)**2/3.)#int((self._size_maze)**2/3.) 
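# About one third of the (size_maze-2)**2 interior cells will be turned into randomly placed walls by create_map below.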
33 | self._n_rewards = 3 34 | self.create_map() 35 | self.intern_dim=3 36 | 37 | def create_map(self): 38 | valid_map=False 39 | while valid_map==False: 40 | # Agent 41 | self._pos_agent=[1,1] 42 | 43 | # Walls 44 | self._pos_walls=[] 45 | for i in range(self._size_maze): 46 | self._pos_walls.append([i,0]) 47 | self._pos_walls.append([i,self._size_maze-1]) 48 | for j in range(self._size_maze-2): 49 | self._pos_walls.append([0,j+1]) 50 | self._pos_walls.append([self._size_maze-1,j+1]) 51 | 52 | n=0 53 | while n < self._n_walls: 54 | potential_wall=[self._random_state.randint(1,self._size_maze-2),self._random_state.randint(1,self._size_maze-2)] 55 | if(potential_wall not in self._pos_walls and potential_wall!=self._pos_agent): 56 | self._pos_walls.append(potential_wall) 57 | n+=1 58 | 59 | # Rewards 60 | #self._pos_rewards=[[self._size_maze-2,self._size_maze-2]] 61 | self._pos_rewards=[] 62 | n=0 63 | while n < self._n_rewards: 64 | potential_reward=[self._random_state.randint(1,self._size_maze-1),self._random_state.randint(1,self._size_maze-1)] 65 | if(potential_reward not in self._pos_rewards and potential_reward not in self._pos_walls and potential_reward!=self._pos_agent): 66 | self._pos_rewards.append(potential_reward) 67 | n+=1 68 | 69 | valid_map=self.is_valid_map(self._pos_agent,self._pos_walls,self._pos_rewards) 70 | 71 | 72 | def is_valid_map(self,pos_agent,pos_walls,pos_rewards): 73 | a = pf.AStar() 74 | pos_walls 75 | walls = [tuple(w) for w in pos_walls] 76 | start=tuple(pos_agent) 77 | for r in pos_rewards: 78 | end=tuple(r) 79 | a.init_grid(self._size_maze, self._size_maze, walls, start, end) 80 | maze=a 81 | optimal_path=maze.solve() 82 | if(optimal_path==None): 83 | return False 84 | 85 | return True 86 | 87 | def reset(self, mode): 88 | self._episode_steps = 0 89 | self._mode=mode 90 | self.create_map() 91 | 92 | if mode == MyEnv.VALIDATION_MODE: 93 | if self._mode != MyEnv.VALIDATION_MODE: 94 | self._mode = MyEnv.VALIDATION_MODE 95 | self._mode_score = 0.0 96 | self._mode_episode_count = 0 97 | 98 | else: 99 | self._mode_episode_count += 1 100 | 101 | return [1 * [self._size_maze * [self._size_maze * [0]]]] 102 | 103 | 104 | def act(self, action): 105 | self._episode_steps += 1 106 | action = self._actions[action] 107 | 108 | reward = -0.1 109 | 110 | if(action==0): 111 | if([self._pos_agent[0]+1,self._pos_agent[1]] not in self._pos_walls): 112 | self._pos_agent[0]=self._pos_agent[0]+1 113 | elif(action==1): 114 | if([self._pos_agent[0],self._pos_agent[1]+1] not in self._pos_walls): 115 | self._pos_agent[1]=self._pos_agent[1]+1 116 | elif(action==2): 117 | if([self._pos_agent[0]-1,self._pos_agent[1]] not in self._pos_walls): 118 | self._pos_agent[0]=self._pos_agent[0]-1 119 | elif(action==3): 120 | if([self._pos_agent[0],self._pos_agent[1]-1] not in self._pos_walls): 121 | self._pos_agent[1]=self._pos_agent[1]-1 122 | 123 | if (self._pos_agent in self._pos_rewards): 124 | reward = 1 125 | self._pos_rewards.remove(self._pos_agent) 126 | 127 | self._mode_score += reward 128 | return reward 129 | 130 | 131 | def summarizePerformance(self, test_data_set, learning_algo, *args, **kwargs): 132 | print ("test_data_set.observations.shape") 133 | print (test_data_set.observations()[0][0:1]) 134 | 135 | print ("self._mode_score:"+str(self._mode_score)+".") 136 | 137 | 138 | def inputDimensions(self): 139 | if(self._higher_dim_obs==True): 140 | return [(1,self._size_maze*6,self._size_maze*6)] 141 | else: 142 | return [(1,self._size_maze,self._size_maze)] 143 | 144 | def 
observationType(self, subject): 145 | return np.float32 146 | 147 | def nActions(self): 148 | return len(self._actions) 149 | 150 | def observe(self): 151 | self._map=np.zeros((self._size_maze,self._size_maze)) 152 | for coord_wall in self._pos_walls: 153 | self._map[coord_wall[0],coord_wall[1]]=1 154 | for coord_reward in self._pos_rewards: 155 | self._map[coord_reward[0],coord_reward[1]]=2 156 | self._map[self._pos_agent[0],self._pos_agent[1]]=0.5 157 | 158 | if(self._higher_dim_obs==True): 159 | indices_reward=np.argwhere(self._map == 2) 160 | indices_agent=np.argwhere(self._map == 0.5) 161 | self._map=self._map/1. 162 | self._map=np.repeat(np.repeat(self._map, 6, axis=0),6, axis=1) 163 | # agent repr 164 | agent_obs=np.zeros((6,6)) 165 | agent_obs[0,2]=0.8 166 | agent_obs[1,0:5]=0.9 167 | agent_obs[2,1:4]=0.9 168 | agent_obs[3,1:4]=0.9 169 | agent_obs[4,1]=0.9 170 | agent_obs[4,3]=0.9 171 | agent_obs[5,0:2]=0.9 172 | agent_obs[5,3:5]=0.9 173 | 174 | # reward repr 175 | reward_obs=np.zeros((6,6)) 176 | reward_obs[:,1]=0.7 177 | reward_obs[0,1:4]=0.6 178 | reward_obs[1,3]=0.7 179 | reward_obs[2,1:4]=0.6 180 | reward_obs[4,2]=0.7 181 | reward_obs[5,2:4]=0.7 182 | 183 | for i in indices_reward: 184 | self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=reward_obs 185 | 186 | for i in indices_agent: 187 | self._map[i[0]*6:(i[0]+1)*6:,i[1]*6:(i[1]+1)*6]=agent_obs 188 | self._map=(self._map*2)-1 #scaling 189 | #print ("self._map higher_dim_obs") 190 | #print (self._map) 191 | #plt.imshow(self._map, cmap='gray_r') 192 | #plt.show() 193 | else: 194 | self._map=self._map/2. 195 | self._map[self._map == 0.5] = 0.99 # agent 196 | self._map[self._map == 1.] = 0.5 # reward 197 | 198 | if(self._reverse==True): 199 | self._map=-self._map #1-self._map 200 | 201 | return [self._map] 202 | 203 | def inTerminalState(self): 204 | if ( self._pos_rewards==[] or (self._mode>=0 and self._episode_steps >= 50) ): 205 | return True 206 | else: 207 | return False 208 | 209 | 210 | 211 | if __name__ == "__main__": 212 | import hashlib 213 | 214 | rng = np.random.RandomState(123456) 215 | env = MyEnv(rng, higher_dim_obs=False) 216 | 217 | maps=[] 218 | for i in range(10000): 219 | env.create_map() 220 | 221 | one_laby=env.observe()[0] 222 | 223 | # Hashing the labyrinths to be able to find duplicates in O(1) 224 | one_laby=int(hashlib.sha1(str(one_laby).encode('utf-8')).hexdigest(), 16) % (10 ** 8) 225 | 226 | # TESTING ADDING DUPLICATION 227 | if i%1000==0: 228 | env.reset(0) 229 | if i%1000==500: 230 | env.reset(1) 231 | 232 | maps.append(copy.deepcopy(one_laby)) 233 | 234 | duplicate_laby=0 235 | for i in range(10000): 236 | env.create_map() 237 | one_laby=env.observe()[0] 238 | 239 | # Hashing the labyrinths to be able to find duplicates in O(1) 240 | one_laby=int(hashlib.sha1(str(one_laby).encode('utf-8')).hexdigest(), 16) % (10 ** 8) 241 | 242 | # TESTING ADDING DUPLICATION 243 | #if i%1000==0: 244 | # maps.append(one_laby) 245 | 246 | # TESTING WITH RESETS 247 | if i%1000==0: 248 | env.reset(0) 249 | if i%1000==500: 250 | env.reset(1) 251 | 252 | duplicate=min(maps.count(one_laby),1) 253 | duplicate_laby+=duplicate 254 | 255 | if i%1000==0: 256 | print ("Number of duplicate labyrinths:"+str(duplicate_laby)+".") 257 | 258 | 259 | 260 | -------------------------------------------------------------------------------- /examples/test_CRAR/run_simple_maze.py: -------------------------------------------------------------------------------- 1 | """Simple maze launcher 2 | 3 | """ 4 | 5 | import sys 6 | import logging 7 | 
import numpy as np 8 | from joblib import hash, dump, load 9 | import os 10 | 11 | from deer.default_parser import process_args 12 | from deer.agent import NeuralAgent 13 | from deer.learning_algos.CRAR_keras import CRAR 14 | from simple_maze_env import MyEnv as simple_maze_env 15 | import deer.experiment.base_controllers as bc 16 | 17 | from deer.policies import EpsilonGreedyPolicy 18 | 19 | 20 | class Defaults: 21 | # ---------------------- 22 | # Experiment Parameters 23 | # ---------------------- 24 | STEPS_PER_EPOCH = 5000 25 | EPOCHS = 50 26 | STEPS_PER_TEST = 1000 27 | PERIOD_BTW_SUMMARY_PERFS = 1 28 | 29 | # ---------------------- 30 | # Environment Parameters 31 | # ---------------------- 32 | FRAME_SKIP = 2 33 | 34 | # ---------------------- 35 | # DQN Agent parameters: 36 | # ---------------------- 37 | UPDATE_RULE = 'rmsprop' 38 | LEARNING_RATE = 0.0005 39 | LEARNING_RATE_DECAY = 0.9 40 | DISCOUNT = 0.9 41 | DISCOUNT_INC = 1 42 | DISCOUNT_MAX = 0.99 43 | RMS_DECAY = 0.9 44 | RMS_EPSILON = 0.0001 45 | MOMENTUM = 0 46 | CLIP_NORM = 1.0 47 | EPSILON_START = 1.0 48 | EPSILON_MIN = 1.0 49 | EPSILON_DECAY = 10000 50 | UPDATE_FREQUENCY = 1 51 | REPLAY_MEMORY_SIZE = 1000000 #replacing with 200000 will works just fine (in case you dont have 18gb of memory) 52 | BATCH_SIZE = 32 53 | FREEZE_INTERVAL = 1000 54 | DETERMINISTIC = False 55 | 56 | 57 | HIGHER_DIM_OBS = True 58 | 59 | if __name__ == "__main__": 60 | logging.basicConfig(level=logging.INFO) 61 | 62 | # --- Parse parameters --- 63 | parameters = process_args(sys.argv[1:], Defaults) 64 | if parameters.deterministic: 65 | rng = np.random.RandomState(123456) 66 | else: 67 | rng = np.random.RandomState() 68 | 69 | # --- Instantiate environment --- 70 | env = simple_maze_env(rng, higher_dim_obs=HIGHER_DIM_OBS) 71 | 72 | # --- Instantiate learning_algo --- 73 | learning_algo = CRAR( 74 | env, 75 | parameters.rms_decay, 76 | parameters.rms_epsilon, 77 | parameters.momentum, 78 | parameters.clip_norm, 79 | parameters.freeze_interval, 80 | parameters.batch_size, 81 | parameters.update_rule, 82 | rng, 83 | high_int_dim=False, 84 | internal_dim=2) 85 | 86 | test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.) 87 | 88 | # --- Instantiate agent --- 89 | agent = NeuralAgent( 90 | env, 91 | learning_algo, 92 | parameters.replay_memory_size, 93 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 94 | parameters.batch_size, 95 | rng, 96 | test_policy=test_policy) 97 | 98 | # --- Create unique filename for FindBestController --- 99 | h = hash(vars(parameters), hash_name="sha1") 100 | fname = "test_" + h 101 | print("The parameters hash is: {}".format(h)) 102 | print("The parameters are: {}".format(parameters)) 103 | 104 | # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy 105 | # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more 106 | # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every 107 | # episode or epoch (or never, hence the resetEvery='none'). 
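# Note that EPSILON_START and EPSILON_MIN are both 1.0 in the Defaults above, so epsilon stays at 1 and the agent acts fully at random; the short agent.run(10, 500) call below therefore only serves to gather data and fill the replay memory before the actual training run (hence the "end gathering data" print and the later agent.gathering_data=False).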
108 | agent.attach(bc.EpsilonController( 109 | initial_e=parameters.epsilon_start, 110 | e_decays=parameters.epsilon_decay, 111 | e_min=parameters.epsilon_min, 112 | evaluate_on='action', 113 | periodicity=1, 114 | reset_every='none')) 115 | 116 | agent.run(10, 500) 117 | print("end gathering data") 118 | 119 | # --- Bind controllers to the agent --- 120 | # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 121 | # learning rate as well as the training epoch number. 122 | agent.attach(bc.VerboseController( 123 | evaluate_on='epoch', 124 | periodicity=1)) 125 | 126 | # At the end of every epoch, one can modify the learning rate using a LearningRateController. Here we 127 | # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 128 | agent.attach(bc.LearningRateController( 129 | initial_learning_rate=parameters.learning_rate, 130 | learning_rate_decay=parameters.learning_rate_decay, 131 | periodicity=1)) 132 | 133 | # Same for the discount factor. 134 | agent.attach(bc.DiscountFactorController( 135 | initial_discount_factor=parameters.discount, 136 | discount_factor_growth=parameters.discount_inc, 137 | discount_factor_max=parameters.discount_max, 138 | periodicity=1)) 139 | 140 | # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 141 | # Plus, we also want to display after each training episode (not after every training step) the average Bellman 142 | # residual and the average of the V values obtained during the last episode, hence the last two arguments. 143 | agent.attach(bc.TrainerController( 144 | evaluate_on='action', 145 | periodicity=parameters.update_frequency, 146 | show_episode_avg_V_value=True, 147 | show_avg_Bellman_residual=True)) 148 | 149 | # We wish to discover, among all versions of our neural network (i.e., after every training epoch), which one 150 | # has the highest validation score. 151 | # To achieve this goal, one can use the FindBestController along with an InterleavedTestEpochController. It is 152 | # important that the validationID is the same as the id argument of the InterleavedTestEpochController. 153 | # The FindBestController will dump on disk the validation scores for each and every network, as well as the 154 | # structure of the neural network having the best validation score. These dumps can then be used to plot the evolution 155 | # of the validation and test scores (see below) or simply recover the resulting neural network for your 156 | # application. 157 | agent.attach(bc.FindBestController( 158 | validationID=simple_maze_env.VALIDATION_MODE, 159 | testID=None, 160 | unique_fname=fname)) 161 | 162 | # All previous controllers control the agent during the epochs it goes through. However, we want to interleave a 163 | # "validation epoch" between each training epoch. For each validation epoch, we also want to display the sum of all 164 | # rewards obtained, hence show_score=True. Finally, we want to call the summarizePerformance method of simple_maze_env 165 | # every [parameters.period_btw_summary_perfs] *validation* epochs (here summarize_every=1).
166 | agent.attach(bc.InterleavedTestEpochController( 167 | id=simple_maze_env.VALIDATION_MODE, 168 | epoch_length=parameters.steps_per_test, 169 | periodicity=1, 170 | show_score=True, 171 | summarize_every=1)) 172 | 173 | # --- Run the experiment --- 174 | try: 175 | os.mkdir("params") 176 | except Exception: 177 | pass 178 | dump(vars(parameters), "params/" + fname + ".jldump") 179 | agent.gathering_data=False 180 | agent.run(parameters.epochs, parameters.steps_per_epoch) 181 | 182 | # --- Show results --- 183 | basename = "scores/" + fname 184 | scores = load(basename + "_scores.jldump") 185 | print (scores) 186 | # plt.plot(range(1, len(scores['vs'])+1), scores['vs'], label="VS", color='b') 187 | # plt.legend() 188 | # plt.xlabel("Number of epochs") 189 | # plt.ylabel("Score") 190 | # plt.savefig(basename + "_scores.pdf") 191 | # plt.show() 192 | -------------------------------------------------------------------------------- /examples/toy_env/Toy_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | The environment simulates the possibility of buying or selling a good. The agent can either have one unit or zero unit of that good. At each transaction with the market, the agent obtains a reward equivalent to the price of the good when selling it and the opposite when buying. In addition, a penalty of 0.5 (negative reward) is added for each transaction. 3 | Two actions are possible for the agent: 4 | - Action 0 corresponds to selling if the agent possesses one unit or idle if the agent possesses zero unit. 5 | - Action 1 corresponds to buying if the agent possesses zero unit or idle if the agent already possesses one unit. 6 | The state of the agent is made up of an history of two punctual observations: 7 | - The price signal 8 | - Either the agent possesses the good or not (1 or 0) 9 | The price signal is build following the same rules for the training and the validation environment. That allows the agent to learn a strategy that exploits this successfully. 10 | 11 | """ 12 | 13 | import numpy as np 14 | from mpl_toolkits.axes_grid1 import host_subplot 15 | import mpl_toolkits.axisartist as AA 16 | import matplotlib.pyplot as plt 17 | 18 | from deer.base_classes import Environment 19 | 20 | class MyEnv(Environment): 21 | 22 | def __init__(self, rng): 23 | """ Initialize environment. 24 | 25 | Parameters 26 | ----------- 27 | rng : the numpy random number generator 28 | """ 29 | # Defining the type of environment 30 | self._last_ponctual_observation = [0, 0] # At each time step, the observation is made up of two elements, each scalar 31 | 32 | self._random_state = rng 33 | 34 | # Building a price signal with some patterns 35 | self._price_signal=[] 36 | for i in range (1000): 37 | price = np.array([0.,0.,0.,-1.,0.,1.,0., 0., 0.]) 38 | price += self._random_state.uniform(0, 3) 39 | self._price_signal.extend(price.tolist()) 40 | 41 | self._price_signal_train = self._price_signal[:len(self._price_signal)//2] 42 | self._price_signal_valid = self._price_signal[len(self._price_signal)//2:] 43 | self._prices = None 44 | self._counter = 1 45 | 46 | def reset(self, mode): 47 | """ Resets the environment for a new episode. 48 | 49 | Parameters 50 | ----------- 51 | mode : int 52 | -1 is for the training phase, others are for validation/test. 53 | 54 | Returns 55 | ------- 56 | list 57 | Initialization of the sequence of observations used for the pseudo-state; dimension must match self.inputDimensions(). 
58 | If only the current observation is used as a (pseudo-)state, then this list is equal to self._last_ponctual_observation. 59 | """ 60 | if mode == -1: 61 | self.prices = self._price_signal_train 62 | else: 63 | self.prices = self._price_signal_valid 64 | 65 | 66 | self._last_ponctual_observation = [self.prices[0], 0] 67 | 68 | self._counter = 1 69 | return [6*[0], 0] 70 | 71 | def act(self, action): 72 | """ Performs one time-step within the environment and updates the current observation self._last_ponctual_observation 73 | 74 | Parameters 75 | ----------- 76 | action : int 77 | Integer in [0, ..., N_A] where N_A is the number of actions given by self.nActions() 78 | 79 | Returns 80 | ------- 81 | reward: float 82 | """ 83 | reward = 0 84 | 85 | if (action == 0 and self._last_ponctual_observation[1] == 1): 86 | reward = self.prices[self._counter-1] - 0.5 87 | if (action == 1 and self._last_ponctual_observation[1] == 0): 88 | reward = -self.prices[self._counter-1] - 0.5 89 | 90 | self._last_ponctual_observation[0] = self.prices[self._counter] 91 | self._last_ponctual_observation[1] = action 92 | 93 | self._counter += 1 94 | 95 | return reward 96 | 97 | def summarizePerformance(self, test_data_set, *args, **kwargs): 98 | """ 99 | This function is called at every PERIOD_BTW_SUMMARY_PERFS. 100 | Parameters 101 | ----------- 102 | test_data_set 103 | """ 104 | 105 | print ("Summary Perf") 106 | 107 | observations = test_data_set.observations() 108 | prices = observations[0][100:200] 109 | invest = observations[1][100:200] 110 | 111 | steps=np.arange(len(prices)) 112 | steps_long=np.arange(len(prices)*10)/10. 113 | 114 | #print steps,invest,prices 115 | host = host_subplot(111, axes_class=AA.Axes) 116 | plt.subplots_adjust(right=0.9, left=0.1) 117 | 118 | par1 = host.twinx() 119 | 120 | host.set_xlabel("Time") 121 | host.set_ylabel("Price") 122 | par1.set_ylabel("Investment") 123 | 124 | p1, = host.plot(steps_long, np.repeat(prices,10), lw=3, c = 'b', alpha=0.8, ls='-', label = 'Price') 125 | p2, = par1.plot(steps, invest, marker='o', lw=3, c = 'g', alpha=0.5, ls='-', label = 'Investment') 126 | 127 | par1.set_ylim(-0.09, 1.09) 128 | 129 | 130 | host.axis["left"].label.set_color(p1.get_color()) 131 | par1.axis["right"].label.set_color(p2.get_color()) 132 | 133 | plt.savefig("plot.png") 134 | plt.close() 135 | 136 | print ("A plot of the policy obtained has been saved under the name plot.png") 137 | 138 | def inputDimensions(self): 139 | return [(6,), (1,)] # We consider an observation made up of an history of 140 | # - the last six for the first scalar element obtained 141 | # - the last one for the second scalar element 142 | 143 | 144 | def nActions(self): 145 | return 2 # The environment allows two different actions to be taken at each time step 146 | 147 | 148 | def inTerminalState(self): 149 | return False 150 | 151 | def observe(self): 152 | return np.array(self._last_ponctual_observation) 153 | 154 | 155 | 156 | 157 | def main(): 158 | # Can be used for debug purposes 159 | rng = np.random.RandomState(123456) 160 | myenv = MyEnv(rng) 161 | 162 | print (myenv.observe()) 163 | 164 | if __name__ == "__main__": 165 | main() 166 | -------------------------------------------------------------------------------- /examples/toy_env/run_toy_env.py: -------------------------------------------------------------------------------- 1 | """Toy environment launcher. See the docs for more details about this environment. 
2 | 3 | """ 4 | 5 | import sys 6 | import logging 7 | import numpy as np 8 | 9 | from deer.default_parser import process_args 10 | from deer.agent import NeuralAgent 11 | from deer.learning_algos.q_net_keras import MyQNetwork 12 | from Toy_env import MyEnv as Toy_env 13 | import deer.experiment.base_controllers as bc 14 | from deer.policies import EpsilonGreedyPolicy 15 | 16 | 17 | class Defaults: 18 | # ---------------------- 19 | # Experiment Parameters 20 | # ---------------------- 21 | STEPS_PER_EPOCH = 1000 22 | EPOCHS = 50 23 | STEPS_PER_TEST = 500 24 | PERIOD_BTW_SUMMARY_PERFS = 1 25 | 26 | # ---------------------- 27 | # Environment Parameters 28 | # ---------------------- 29 | FRAME_SKIP = 1 30 | 31 | # ---------------------- 32 | # DQN Agent parameters: 33 | # ---------------------- 34 | UPDATE_RULE = 'rmsprop' 35 | LEARNING_RATE = 0.005 36 | LEARNING_RATE_DECAY = 1. 37 | DISCOUNT = 0.9 38 | DISCOUNT_INC = 1. 39 | DISCOUNT_MAX = 0.99 40 | RMS_DECAY = 0.9 41 | RMS_EPSILON = 0.0001 42 | MOMENTUM = 0 43 | CLIP_NORM = 1.0 44 | EPSILON_START = 1.0 45 | EPSILON_MIN = .1 46 | EPSILON_DECAY = 10000 47 | UPDATE_FREQUENCY = 1 48 | REPLAY_MEMORY_SIZE = 1000000 49 | BATCH_SIZE = 32 50 | FREEZE_INTERVAL = 1000 51 | DETERMINISTIC = True 52 | 53 | 54 | if __name__ == "__main__": 55 | logging.basicConfig(level=logging.INFO) 56 | 57 | # --- Parse parameters --- 58 | parameters = process_args(sys.argv[1:], Defaults) 59 | if parameters.deterministic: 60 | rng = np.random.RandomState(123456) 61 | else: 62 | rng = np.random.RandomState() 63 | 64 | # --- Instantiate environment --- 65 | env = Toy_env(rng) 66 | 67 | # --- Instantiate qnetwork --- 68 | qnetwork = MyQNetwork( 69 | env, 70 | parameters.rms_decay, 71 | parameters.rms_epsilon, 72 | parameters.momentum, 73 | parameters.clip_norm, 74 | parameters.freeze_interval, 75 | parameters.batch_size, 76 | parameters.update_rule, 77 | rng) 78 | 79 | train_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.1) 80 | test_policy = EpsilonGreedyPolicy(qnetwork, env.nActions(), rng, 0.) 81 | 82 | # --- Instantiate agent --- 83 | agent = NeuralAgent( 84 | env, 85 | qnetwork, 86 | parameters.replay_memory_size, 87 | max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))), 88 | parameters.batch_size, 89 | rng, 90 | train_policy=train_policy, 91 | test_policy=test_policy) 92 | 93 | # --- Bind controllers to the agent --- 94 | # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and 95 | # learning rate as well as the training epoch number. 96 | agent.attach(bc.VerboseController( 97 | evaluate_on='epoch', 98 | periodicity=1)) 99 | 100 | # During training epochs, we want to train the agent after every [parameters.update_frequency] action it takes. 101 | # Plus, we also want to display after each training episode (!= than after every training) the average bellman 102 | # residual and the average of the V values obtained during the last episode, hence the two last arguments. 103 | agent.attach(bc.TrainerController( 104 | evaluate_on='action', 105 | periodicity=parameters.update_frequency, 106 | show_episode_avg_V_value=True, 107 | show_avg_Bellman_residual=True)) 108 | 109 | # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we 110 | # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given. 
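# (Presumably the learning rate is multiplied by learning_rate_decay at each such update; with LEARNING_RATE_DECAY = 1. in the Defaults above it therefore stays at its initial value of 0.005 for the whole run.)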
111 | agent.attach(bc.LearningRateController( 112 | initial_learning_rate=parameters.learning_rate, 113 | learning_rate_decay=parameters.learning_rate_decay, 114 | periodicity=1)) 115 | 116 | # Same for the discount factor. 117 | agent.attach(bc.DiscountFactorController( 118 | initial_discount_factor=parameters.discount, 119 | discount_factor_growth=parameters.discount_inc, 120 | discount_factor_max=parameters.discount_max, 121 | periodicity=1)) 122 | 123 | # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy 124 | # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more 125 | # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every 126 | # episode or epoch (or never, hence the resetEvery='none'). 127 | agent.attach(bc.EpsilonController( 128 | initial_e=parameters.epsilon_start, 129 | e_decays=parameters.epsilon_decay, 130 | e_min=parameters.epsilon_min, 131 | evaluate_on='action', 132 | periodicity=1, 133 | reset_every='none')) 134 | 135 | # We also want to interleave a "test epoch" between each training epoch. 136 | # For each test epoch, we want also to display the sum of all rewards obtained, hence the showScore=True. 137 | # Finally, we want to call the summarizePerformance method of Toy_Env every [parameters.period_btw_summary_perfs] 138 | # *test* epochs. 139 | agent.attach(bc.InterleavedTestEpochController( 140 | id=0, 141 | epoch_length=parameters.steps_per_test, 142 | periodicity=1, 143 | show_score=True, 144 | summarize_every=parameters.period_btw_summary_perfs)) 145 | 146 | print ("Starting the run of the agent for "+str(parameters.epochs)+" epochs, with "+str(parameters.steps_per_epoch)+" steps per epoch") 147 | # --- Run the experiment --- 148 | agent.run(parameters.epochs, parameters.steps_per_epoch) 149 | -------------------------------------------------------------------------------- /examples/toy_env/run_toy_env_simple.py: -------------------------------------------------------------------------------- 1 | """Toy environment launcher. See the docs for more details about this environment. 2 | 3 | """ 4 | 5 | import numpy as np 6 | 7 | from deer.agent import NeuralAgent 8 | from deer.learning_algos.q_net_keras import MyQNetwork 9 | from Toy_env import MyEnv as Toy_env 10 | import deer.experiment.base_controllers as bc 11 | 12 | 13 | rng = np.random.RandomState(123456) 14 | 15 | # --- Instantiate environment --- 16 | env = Toy_env(rng) 17 | 18 | # --- Instantiate qnetwork --- 19 | qnetwork = MyQNetwork( 20 | environment=env, 21 | random_state=rng) 22 | 23 | # --- Instantiate agent --- 24 | agent = NeuralAgent( 25 | env, 26 | qnetwork, 27 | random_state=rng) 28 | 29 | # --- Bind controllers to the agent --- 30 | # Before every training epoch, we want to print a summary of the agent's epsilon, discount and 31 | # learning rate as well as the training epoch number. 32 | agent.attach(bc.VerboseController()) 33 | 34 | # During training epochs, we want to train the agent after every action it takes. 35 | # Plus, we also want to display after each training episode (!= than after every training) the average bellman 36 | # residual and the average of the V values obtained during the last episode. 37 | agent.attach(bc.TrainerController()) 38 | 39 | # We also want to interleave a "test epoch" between each training epoch. 
40 | agent.attach(bc.InterleavedTestEpochController(epoch_length=500)) 41 | 42 | # --- Run the experiment --- 43 | agent.run(n_epochs=100, epoch_length=1000) 44 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF 17 | #formats: 18 | # - pdf 19 | 20 | # Optionally set the version of Python and requirements required to build your docs 21 | #python: 22 | # version: 3.7 23 | # install: 24 | # - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /requirements-docs.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | sphinx 3 | numpydoc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.19 2 | joblib>=0.16 3 | matplotlib>=3.3.2 4 | tensorflow>=2.6 5 | keras>=2.6 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | import deer 4 | 5 | NAME = 'deer' 6 | VERSION = '0.4.3' 7 | AUTHOR = "Vincent Francois-Lavet" 8 | AUTHOR_EMAIL = "vincent.francois@gmail.com" 9 | URL = 'https://github.com/VinF/deer' 10 | DESCRIPTION = 'Framework for deep reinforcement learning' 11 | with open('README.rst') as f: 12 | LONG_DESCRIPTION = f.read() 13 | CLASSIFIERS = [ 14 | 'Development Status :: 3 - Alpha', 15 | 'Environment :: Console', 16 | 'Intended Audience :: Developers', 17 | 'Intended Audience :: Science/Research', 18 | 'Intended Audience :: Education', 19 | 'License :: OSI Approved :: BSD License', 20 | 'Operating System :: OS Independent', 21 | 'Programming Language :: Python :: 2.7', 22 | 'Programming Language :: Python :: 3', 23 | 'Programming Language :: Python :: 3.6', 24 | 'Programming Language :: Python :: 3.7', 25 | 'Topic :: Scientific/Engineering', 26 | 'Topic :: Utilities', 27 | 'Topic :: Software Development :: Libraries', 28 | ] 29 | 30 | if __name__ == '__main__': 31 | setup(name=NAME, 32 | version=VERSION, 33 | author=AUTHOR, 34 | author_email=AUTHOR_EMAIL, 35 | url=URL, 36 | description=DESCRIPTION, 37 | long_description=LONG_DESCRIPTION, 38 | license='BSD', 39 | classifiers=CLASSIFIERS, 40 | platforms='any', 41 | packages=find_packages()) --------------------------------------------------------------------------------
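To make the structure of the examples above concrete, here is a minimal sketch of a custom environment plugged into the default Q-network and agent. It only uses calls that already appear in Toy_env.py and run_toy_env_simple.py; the TwoStateEnv class, its reward scheme and the epoch lengths are made up for illustration, so treat it as a starting point rather than a tested script.

import numpy as np

from deer.base_classes import Environment
from deer.agent import NeuralAgent
from deer.learning_algos.q_net_keras import MyQNetwork
import deer.experiment.base_controllers as bc


class TwoStateEnv(Environment):
    """Hypothetical two-state environment: the agent moves to whatever state
    (0 or 1) the action designates, and gets a reward of +1 while in state 1."""

    def __init__(self, rng):
        self._random_state = rng
        self._state = 0

    def reset(self, mode):
        self._state = 0
        return [2 * [0]]          # one observation subject, history of length 2 (see inputDimensions)

    def act(self, action):
        self._state = action      # action is 0 or 1
        return 1. if self._state == 1 else 0.

    def summarizePerformance(self, test_data_set, *args, **kwargs):
        pass                      # nothing to plot for this sketch

    def inputDimensions(self):
        return [(2,)]             # history of the last two scalar observations

    def nActions(self):
        return 2

    def inTerminalState(self):
        return False

    def observe(self):
        return [np.float32(self._state)]


if __name__ == "__main__":
    rng = np.random.RandomState(123456)
    env = TwoStateEnv(rng)

    # Default-constructed Q-network and agent, exactly as in run_toy_env_simple.py
    qnetwork = MyQNetwork(environment=env, random_state=rng)
    agent = NeuralAgent(env, qnetwork, random_state=rng)

    agent.attach(bc.VerboseController())
    agent.attach(bc.TrainerController())
    agent.attach(bc.InterleavedTestEpochController(epoch_length=200))

    agent.run(n_epochs=5, epoch_length=500)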