├── .gitignore ├── LICENSE ├── README.md ├── assets └── images │ ├── 2023-01-24-11-04-11.png │ ├── 2023-01-24-11-04-28.png │ ├── acc_vs_steps_alpha.png │ ├── accuracy_vs_steps_eps.png │ ├── blackjack_rms_vs_ep.png │ ├── blackjack_v.png │ ├── expected_r_vs_steps.png │ ├── expected_r_vs_steps_alpha.png │ ├── infinite_variance.png │ ├── maximization_bias.png │ ├── mc_vs_td.png │ ├── mountain_car_n1_vs_n8.png │ ├── n_effect.png │ ├── ordinary_vs_weighted.png │ ├── q_vs_sarsa.png │ ├── single_state.png │ ├── steps_per_episode_vs_episode.png │ ├── ucb_expected_reward_vs_steps.png │ ├── ucb_steps_vs_acc.png │ └── uct.png ├── docs ├── MDP.md └── MODELFREE.md ├── examples ├── blackjack.py ├── dyna_maze.py ├── gridworld.py ├── mcts.py ├── mountain_car.py ├── random_walk.py ├── short_corridor.py ├── single_state.py ├── state_aggregation.py └── windy_gridworld.py ├── requirements.txt ├── rl ├── __init__.py ├── approximators.py ├── armed_bandits.py ├── mdp.py ├── model_free.py ├── solvers │ ├── __init__.py │ ├── approx.py │ ├── model_based.py │ ├── model_free.py │ └── planning.py ├── tiles.py └── utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 ivan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/ivanbelenky/RL/blob/master/LICENSE) 4 | [![Python 3.8](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/release/python-360/) 5 | 6 | ## Installation 7 | 8 | ### setup.py 9 | ```sh 10 | $ python setup.py install 11 | ``` 12 | 13 | # Overview 14 | 15 | This repository contains code that implements algorithms and models from Sutton's book on reinforcement learning. The book, titled "Reinforcement Learning: An Introduction," is a classic text on the subject and provides a comprehensive introduction to the field. 16 | 17 | The code in this repository is organized into several modules, each of which covers different topics. 18 | 19 | 20 | # Methods 21 | 22 | - [x] Multi Armed Bandits 23 | - [x] Epsilon Greedy 24 | - [x] Optimistic Initial Values 25 | - [x] Gradient 26 | - [x] α (non stationary) 27 | - [x] Model Based 28 | - [x] Policy Evaluation 29 | - [x] Policy Iteration 30 | - [x] Value Iteration 31 | - [x] Monte Carlo estimation and control 32 | - [x] First-visit α-MC 33 | - [x] Every-visit α-MC 34 | - [x] MC with Exploring Starts 35 | - [x] Off-policy MC, ordinary and weighted importance sampling 36 | - [x] Temporal Difference 37 | - [x] TD(n) estimation 38 | - [x] n-step SARSA 39 | - [x] n-step Q-learning 40 | - [x] n-step Expected SARSA 41 | - [x] double Q learning 42 | - [x] n-step Tree Backup 43 | - [x] Planning 44 | - [x] Dyna-Q/Dyna-Q+ 45 | - [x] Prioritized Sweeping 46 | - [x] Trajectory Sampling 47 | - [x] MCTS 48 | - [ ] On-policy Prediction 49 | - [x] Gradient MC 50 | - [x] $n$-step semi-gradient TD 51 | - [ ] ANN 52 | - [ ] Least-Squares TD 53 | - [ ] Kernel-based 54 | - [x] On-policy Control 55 | - [x] Episodic semi-gradient 56 | - [x] Semi-gradient n-step Sarsa 57 | - [x] Differential Semi-gradient n-step Sarsa 58 | - [ ] Eligibility Traces 59 | - [x] TD($\lambda$) 60 | - [ ] True Online 61 | - [x] Sarsa($\lambda$) 62 | - [ ] True Online Sarsa($\lambda$) 63 | - [ ] Policy Gradient 64 | - [x] REINFORCE: Monte Carlo Policy Gradient w/wo Baseline 65 | - [ ] Actor-Critic (episodic) w/wo eligibility traces 66 | - [ ] Actor-Critic (continuing) with eligibility traces 67 |
68 | 69 | All model free solvers will work just by defining `states`, `actions`, and a `transition` function. Transitions are defined as a function that takes a state and an action and returns a tuple containing the next state and the reward, plus a boolean indicating whether the episode has terminated. 70 | 71 | ```python 72 | states: Sequence[Any] 73 | actions: Sequence[Any] 74 | transition: Callable[[Any, Any], Tuple[Tuple[Any, float], bool]] 75 | ``` 76 | 77 | # Examples 78 | 79 | **Single State Infinite Variance Example 5.5** 80 | 81 | ![](https://github.com/ivanbelenky/RL/blob/master/assets/images/single_state.png) 82 | 83 | 84 | ```python 85 | import numpy as np 86 | from rl import off_policy_mc, ModelFreePolicy 87 | states = [0] 88 | actions = ['left', 'right'] 89 | 90 | def single_state_transition(state, action): 91 | if action == 'right': 92 | return (state, 0), True 93 | if action == 'left': 94 | threshold = np.random.random() 95 | if threshold > 0.9: 96 | return (state, 1), True 97 | else: 98 | return (state, 0), False 99 | 100 | b = ModelFreePolicy(actions, states) # by default equiprobable 101 | pi = ModelFreePolicy(actions, states) 102 | pi.pi[0] = np.array([1, 0]) # target policy always goes left 103 | 104 | # calculate state value functions with ordinary and weighted importance sampling 105 | vqpi_ord, samples_ord = off_policy_mc(states, actions, single_state_transition, 106 | policy=pi, b=b, ordinary=True, first_visit=True, gamma=1., n_episodes=1E4) 107 | 108 | vqpi_w, samples_w = off_policy_mc(states, actions, single_state_transition, 109 | policy=pi, b=b, ordinary=False, first_visit=True, gamma=1., n_episodes=1E4) 110 | ``` 111 | 112 | ![](https://github.com/ivanbelenky/RL/blob/master/assets/images/ordinary_vs_weighted.png) 113 | 114 |
115 | 116 | **Monte Carlo Tree Search maze solving plot** 117 | 118 | ```python 119 | s = START_XY 120 | budget = 500 121 | cp = 1/np.sqrt(2) 122 | end = False 123 | max_steps = 50 124 | while not end: 125 | action, tree = mcts(s, cp, budget, obstacle_maze, action_map, max_steps, eps=1) 126 | (s, _), end = obstacle_maze(s, action) 127 | 128 | tree.plot() 129 | ``` 130 | 131 | ![](https://github.com/ivanbelenky/RL/blob/master/assets/images/uct.png) 132 | 133 |
134 | 135 | # Contributing 136 | 137 | While the code in this package provides a basic implementation of the algorithms from the book, it is not necessarily the most efficient or well-written. If you have suggestions for improving the code, please feel free to open an issue. 138 | 139 | Overall, this package provides a valuable resource for anyone interested in learning about reinforcement learning and implementing algorithms from scratch. By no means prod ready. 140 | -------------------------------------------------------------------------------- /assets/images/2023-01-24-11-04-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/2023-01-24-11-04-11.png -------------------------------------------------------------------------------- /assets/images/2023-01-24-11-04-28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/2023-01-24-11-04-28.png -------------------------------------------------------------------------------- /assets/images/acc_vs_steps_alpha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/acc_vs_steps_alpha.png -------------------------------------------------------------------------------- /assets/images/accuracy_vs_steps_eps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/accuracy_vs_steps_eps.png -------------------------------------------------------------------------------- /assets/images/blackjack_rms_vs_ep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/blackjack_rms_vs_ep.png -------------------------------------------------------------------------------- /assets/images/blackjack_v.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/blackjack_v.png -------------------------------------------------------------------------------- /assets/images/expected_r_vs_steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/expected_r_vs_steps.png -------------------------------------------------------------------------------- /assets/images/expected_r_vs_steps_alpha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/expected_r_vs_steps_alpha.png -------------------------------------------------------------------------------- /assets/images/infinite_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/infinite_variance.png -------------------------------------------------------------------------------- 
/assets/images/maximization_bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/maximization_bias.png -------------------------------------------------------------------------------- /assets/images/mc_vs_td.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/mc_vs_td.png -------------------------------------------------------------------------------- /assets/images/mountain_car_n1_vs_n8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/mountain_car_n1_vs_n8.png -------------------------------------------------------------------------------- /assets/images/n_effect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/n_effect.png -------------------------------------------------------------------------------- /assets/images/ordinary_vs_weighted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/ordinary_vs_weighted.png -------------------------------------------------------------------------------- /assets/images/q_vs_sarsa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/q_vs_sarsa.png -------------------------------------------------------------------------------- /assets/images/single_state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/single_state.png -------------------------------------------------------------------------------- /assets/images/steps_per_episode_vs_episode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/steps_per_episode_vs_episode.png -------------------------------------------------------------------------------- /assets/images/ucb_expected_reward_vs_steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/ucb_expected_reward_vs_steps.png -------------------------------------------------------------------------------- /assets/images/ucb_steps_vs_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/ucb_steps_vs_acc.png -------------------------------------------------------------------------------- /assets/images/uct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/uct.png -------------------------------------------------------------------------------- 
/docs/MDP.md: -------------------------------------------------------------------------------- 1 | # Markov Decision Process (MDP) Framework 2 | 3 | This code provides a framework for defining and solving Markov Decision Processes (MDPs) in Python. It includes classes for defining MDPs, policies, and rewards, as well as functions for solving MDPs using various algorithms. 4 | 5 | ## Classes 6 | ### `MarkovReward` 7 | 8 | The `MarkovReward` class is an abstract base class for generating rewards in an MDP. It defines the `generate()` and `r_sas()` methods, which must be implemented by subclasses. 9 | ### `TabularReward` 10 | 11 | The `TabularReward` class is a concrete implementation of `MarkovReward` that uses a reward table to generate rewards. It has a constructor that takes a reward table `r_sa` as an input and stores it internally. The `generate()` method returns the reward for a given state and action, and the `r_sas()` method returns the mean reward for the next state. It may seem like overkill to create a class just to hold a table, but the idea is to be able to define arbitrary reward-generator functions for each state-action pair, i.e. to deal with independent $p(r,s'|s,a)$ distributions, even continuous ones. 12 | 13 | ### `MarkovPolicy` 14 | 15 | The `MarkovPolicy` class extends the `Policy` abstract base class and defines a policy for an `MDP`. It has a constructor that takes a policy table `pi_sa` as an input and stores it internally. The `update_policy()` method updates the policy using the given value function `q_pi`, and the `π()` method returns the policy for a given state. 16 | 17 | 18 | ### `MDP` 19 | 20 | The `MDP` class represents an MDP and provides methods for solving it. It has a constructor that takes a state transition matrix `p_s`, a list of states, a list of actions, a discount factor `gamma`, and optional `policy` and `reward_gen` objects. The `value_function()` and `optimal_policy()` methods can be used to compute the value function and optimal policy for the MDP using various solvers. 21 | 22 | ### `Solvers` 23 | 24 | The code includes a number of solver functions for computing the value function and optimal policy of an MDP, including `vq_pi_iter_naive`, `policy_iteration`, and `value_iteration`. These solvers can be used with the `MDP` class's `value_function()` and `optimal_policy()` methods to solve an MDP. 25 | 26 |
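To make the classes above concrete, here is a minimal, hedged usage sketch in the spirit of `examples/gridworld.py`. Only the constructor shape and the `vq_pi()` / `optimize_policy()` calls mirror that example; the tiny two-state chain, its rewards, and the `'policy_iteration'` method string are illustrative assumptions, not documented API.

```python
import numpy as np

from rl.mdp import MDP, TabularReward

# Hypothetical two-state chain: p_s[s, a, s'] holds transition probabilities.
p_s = np.zeros((2, 2, 2))
p_s[:, 0, 0] = 1.0   # action 0 always leads to state 0
p_s[:, 1, 1] = 1.0   # action 1 always leads to state 1

# r_sa[s, a]: reward for taking action a in state s (made up for illustration).
r_sa = np.array([[0.0, 1.0],
                 [0.0, 0.0]])

mdp = MDP(p_s, np.arange(2), np.arange(2), gamma=0.9,
          reward_gen=TabularReward(r_sa))

v, q = mdp.vq_pi()                              # evaluate the default equiprobable policy
mdp.optimize_policy(method='policy_iteration')  # assumed method string, matching the solvers listed above
v_opt, q_opt = mdp.vq_pi()
```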
27 | 28 | # **Dynamic Programming (DP) the cool kid in town** 29 | 30 | DP is the cool kid in town, since everybody is trying to copy him in some way or other. This does not mean that he is the coolest. 31 | 32 | DP is a collection of algorithms that can be used to compute optimal policies 33 | given a perfect model of the environment as an MDP. It is of limited practical use given 34 | its great computational expense, but all later methods can be thought of as trying to 35 | achieve the same effect. 36 | 37 | The MDP is assumed finite. Even for continuous examples the approach taken is always 38 | to quantize them. 39 | 40 | Optimality equations are operators. The system is guaranteed to have a 41 | solution and to converge if gamma < 1 or the runs are episodic. If the 42 | task is completely known then this is just a system of linear equations. 43 | 44 | The Bellman equation itself is an operator/mapping whose fixed point is the value function. It is usually called the **Bellman Expectation Operator** or the **Bellman Policy Operator**. It can be proven to converge under reasonable assumptions, like $\gamma < 1$. This method is known as **iterative policy evaluation**. 45 | 46 |
47 | 48 | ## **Iterative Evaluation** 49 | 50 | The iterative solution for the policy's expected value function can be written as 51 | 52 | $$ \color{orange} 53 | v_{k+1}(s) = { \sum_{a \in A} \pi(a|s) \sum_{s', r} p(s',r|s,a)[ r + \gamma v_k(s')] = \operatorname{B_{\pi}}[v_{k}(s)] } 54 | $$ 55 | 56 | where $\operatorname{B_{\pi}}$ is the **Bellman Expectation Operator**, which arises naturally from considering the Bellman equation as an operator that acts on the value function. It is easy to see that the actual value function is a fixed point of this operator. 57 | 58 | $$\color{orange} 59 | \operatorname{B_{\pi}}[v_{\pi}] = v_{\pi} 60 | $$ 61 | 62 | 63 | It is easy to show that $\operatorname{B_{\pi}}$ is a contraction mapping under the $L_\infty$ norm 64 | 65 | $$\color{orange} 66 | \begin{aligned} 67 | \left|\left|\operatorname{B_{\pi}}[v] - \operatorname{B_{\pi}}[u]\right|\right|_\infty &= \\ \\ 68 | &= \gamma \left|\left| \sum_{a \in A} \pi(a|s) \sum_{s', r} p(s',r|s,a)[v(s') - u(s')]\right|\right|_\infty \\ \\ 69 | &\leq \gamma ||v - u||_\infty 70 | \end{aligned} 71 | $$ 72 | 73 | Given that the value function is the unique fixed point, we can show that 74 | 75 | $$\color{orange} 76 | \lim_{k\rightarrow \infty} \operatorname{B_{\pi}}^k[v_0] = v_{\pi} 77 | $$ 78 | 79 | given the fact that 80 | 81 | $$\color{orange} 82 | \left|\left|v_{k} - v_{\pi} \right|\right| = \left|\left| \operatorname{B_{\pi}}[v_{k-1}] - \operatorname{B_{\pi}}[v_{\pi}] \right|\right| \leq \gamma \left|\left| v_{k-1} - v_{\pi} \right|\right| \leq \cdots \leq \gamma^k \left|\left| v_{0} - v_{\pi} \right|\right| 83 | $$ 84 | 85 |
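As a complement to the derivation above, here is a minimal NumPy sketch of iterative policy evaluation, written against plain arrays rather than this repo's `MDP` class; the array shapes and the function name are my own choices.

```python
import numpy as np

def iterative_policy_evaluation(p, r, pi, gamma=0.9, tol=1e-8):
    """Repeatedly apply the Bellman expectation operator B_pi until a fixed point.

    p:  (S, A, S') array with p(s'|s,a)
    r:  (S, A) array with expected rewards r(s,a)
    pi: (S, A) array with action probabilities pi(a|s)
    """
    v = np.zeros(p.shape[0])
    while True:
        # B_pi[v](s) = sum_a pi(a|s) [ r(s,a) + gamma * sum_s' p(s'|s,a) v(s') ]
        v_new = np.einsum('sa,sa->s', pi, r) + gamma * np.einsum('sa,saz,z->s', pi, p, v)
        if np.max(np.abs(v_new - v)) < tol:
            return v_new
        v = v_new
```

Because $\operatorname{B_{\pi}}$ is a $\gamma$-contraction, the loop is guaranteed to terminate for $\gamma < 1$.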
86 | 87 | ## **Policy Improvement** 88 | 89 | As the title suggests, dynamic programming also encompasses methods to solve the optimality problem, that is, to find the best policy there is for a given MDP. In the same fashion we can define Optimality Equations for the value function. It can easily be proven by contradiction that the following is true 90 | 91 | $$\color{orange} 92 | v_{*}(s) = \max_{a \in A} q_{\pi_{*}}(s,a) = \max_{a \in A} \sum_{s', r} p(s',r|s,a)[ r + \gamma v_{*}(s')] 93 | $$ 94 | 95 | 96 | A mouthful of a contradiction. If the above were not true, it would be possible to: 97 | - define a $\pi'(s)$ that modifies the policy for all states with the above rule 98 | - the new policy chooses the action that maximizes the value function for every $s$. 99 | - calculate the value function with this new policy 100 | - when we encounter each state again the new policy is going to kick in. Since it always gives more reward, and since the value function is composed of discounted rewards, we now have a policy with a higher value function. Hence a contradiction, since $v_{*}$ was already optimal. 101 | 102 | If that was still a mouthful, see that 103 | 104 | $$\color{orange} 105 | \begin{aligned} 106 | v_{\pi}(s) &\leq q_{\pi}(s, \pi'(s)) \\ 107 | &= \mathbb{E_{\pi'}}[r+\gamma v_{\pi}(s')]\\ 108 | &\leq \mathbb{E_{\pi'}}[r+\gamma q_{\pi}(s', \pi'(s'))]\\ 109 | &= \mathbb{E_{\pi'}}[r+\gamma r' + \gamma^2 v_{\pi}(s'')]\\ 110 | & \ \ \vdots \\ 111 | &\leq \mathbb{E_{\pi'}}\left[\sum_k r_k \gamma^k \right]\\ 112 | &= v_{\pi'}(s) 113 | \end{aligned} 114 | $$ 115 | 116 | ### **Policy Iteration** 117 | 118 | It is precisely the policy improvement theorem that guarantees that policy iteration will converge to the optimal policy after a number of iterations. 119 | 120 | As Sutton illustrates, the Policy Iteration algorithm consists of the following 121 | 122 | $$\color{orange} 123 | \pi_0 \overset{\mathbb{E}}{\longrightarrow} v_{\pi_0} \overset{\mathbb{I}}{\longrightarrow} \pi_1 \overset{\mathbb{E}}{\longrightarrow} \cdots \overset{\mathbb{I}}{\longrightarrow} \pi_{*} \overset{\mathbb{E}}{\longrightarrow} v_{\pi_{*}} 124 | $$ 125 | 126 | This particular solution is quite costly, since we have to perform a full evaluation step every single time the policy changes. But there is good news, namely Value Iteration. 127 | 128 | ### **Value Iteration** 129 | 130 | $$\color{orange} 131 | v_{\pi}(s)=\sum_{a \in A} \pi(a|s)q_{\pi}(s,a) 132 | $$ 133 | 134 | We then define the **Bellman Optimality Operator** as 135 | 136 | $$\color{orange} 137 | \operatorname{B_{*}}[v(s)] := \max_a \sum_{s', r} p(s',r|s,a)[ r + \gamma v(s')] 138 | $$ 139 | 140 | and we can show that it is a contraction mapping under the $L_\infty$ norm once again, with the help of the following property 141 | 142 | $$\color{orange} 143 | |\max_a f(a) - \max_a g(a) | \leq \max_a |f(a) - g(a)| 144 | $$ 145 | 146 | then 147 | 148 | $$\color{orange} 149 | \begin{aligned} 150 | \left|\left|\operatorname{B_{*}}[v] - \operatorname{B_{*}}[u]\right|\right|_\infty &= \\ \\ 151 | &= \gamma \left|\left| \max_a \sum_{s', r} p(s',r|s,a)[v(s') - u(s')]\right|\right|_\infty \\ \\ 152 | &\leq \gamma ||v - u||_\infty 153 | \end{aligned} 154 | $$ 155 | 156 | Once again the optimal value function is a fixed point of the Bellman Optimality Operator, i.e.
157 | 158 | $$\color{orange} 159 | v_{*} = \operatorname{B_{*}}[v_{*}] 160 | $$ 161 | 162 | implying that an iterative approach can be built such that 163 | 164 | $$\color{orange} 165 | v_{k+1} = \operatorname{B_{*}}[v_{k}] 166 | $$ 167 | 168 | Since we are guaranteed convergence, we can basically apply policy iteration but truncate the iterative policy evaluation step to a single sweep. 169 | 170 | 171 | ### **Drawbacks of DP** 172 | 173 | It is evident that for problems with massive state-action spaces, even with high-powered compute, DP is not a cost-effective solution. This is because the time complexity of DP is polynomial in the size of the state and action space (assuming the action space stays constant; if it also grows, it is naturally even worse). 174 | 175 | Asynchronous DP is a solution to this problem: as the name suggests, updates are not synchronized between improvement steps. As Sutton states _"Of course, avoiding sweeps does not necessarily mean that we can get away with less computation. It just means that an algorithm does not need to get locked into any hopelessly long sweep before it can make progress improving a policy"_. 176 | 177 | A few words on Generalized Policy Iteration. 178 | 179 | ![](/assets/images/2023-01-24-11-04-28.png) 180 | 181 | There is a nice intuition behind what is going on whenever we are performing any of the methods described above. By the word greedy, we are stating that we are going to choose the action that locally maximizes the value function for the next state, not taking into account that we might be selecting this policy with regard to an _outdated_ value function. But given the niceness of the operators, and by niceness I mean that they are contraction mappings, we are left with the following picture 182 | 183 | ![](/assets/images/2023-01-24-11-04-11.png) 184 | 185 | The two steps in GPI pull in opposite directions: policy improvement pushes the policy toward the greedy optimal solution, and by doing so makes the current value function invalid in some sense. The evaluation step corrects this invalidity, and by doing so lets everybody know that the current policy is not optimal. For the techniques shown, given that there is a contraction mapping and guaranteed convergence to the unique optimal solution, it could be argued that the value function space is convex. 186 | 187 | 188 |
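To close the DP discussion, here is a minimal sketch of value iteration in the same spirit, again over plain NumPy arrays (shapes and names are my own; the repo exposes its own `value_iteration` solver through the `MDP` class).

```python
import numpy as np

def value_iteration(p, r, gamma=0.9, tol=1e-8):
    """Iterate the Bellman optimality operator B* and read off a greedy policy.

    p: (S, A, S') array with p(s'|s,a);  r: (S, A) array with r(s,a).
    """
    v = np.zeros(p.shape[0])
    while True:
        q = r + gamma * np.einsum('saz,z->sa', p, v)  # q(s,a) = r(s,a) + γ Σ_s' p(s'|s,a) v(s')
        v_new = q.max(axis=1)                         # B*[v](s) = max_a q(s,a)
        if np.max(np.abs(v_new - v)) < tol:
            return v_new, q.argmax(axis=1)            # optimal values and a greedy deterministic policy
        v = v_new
```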
189 |
190 | 191 | ### Copyright 192 | Copyright © 2023 Iván Belenky. This code is licensed under the MIT License. 193 | -------------------------------------------------------------------------------- /docs/MODELFREE.md: -------------------------------------------------------------------------------- 1 | # **Monte Carlo Methods** 2 | 3 | _"Monte Carlo methods utilize experience—sample sequences of states, actions, and rewards from actual or simulated interaction with an environment. Learning from actual experience is striking because it requires no prior knowledge of the environment’s dynamics, yet can still attain optimal behavior. Learning from simulated experience is also powerful. Although a model is required, the model need only generate sample transitions, not the complete probability distributions of all possible transitions that is required for dynamic programming (DP). In surprisingly many cases it is easy to generate experience sampled according to the desired probability distributions, but infeasible to obtain the distributions in explicit form."_ 4 | 5 | So basically, if we have a universe in which we can sample stuff, we don't even have to bother with the model. If we want to simulate it, we must create a transition model, but we still leave out the complication of building the actual transition probability function for each state-action pair. It is fair to say that given this approach, it is mandatory to take an episodic approach, otherwise there is no end to the endeavor. 6 | 7 | ## **Monte Carlo Prediction** 8 | 9 | The goal here is to learn the state-value function. The basic idea behind Monte Carlo is to average the returns observed across every episode we have at hand. Let's say that we have to find $\color{orange}v_{\pi}(s)$. Each occurrence of state $\color{orange}s$ in an episode is called a _visit_ to $\color{orange}s$. 10 | 11 | ### **First visit Monte Carlo** 12 | 13 | First-visit MC just averages the returns following the _first visit to each state_ $\color{orange}s$ 14 | 15 | Given 16 | - Input: $\color{orange}\pi$ 17 | - Initialize 18 | - state-value: $\color{orange}v(s)$ 19 | - returns for each state: $\color{orange}R(s)$ 20 | - While some condition is true (tolerance, amount of iterations) 21 | - Generate an episode following policy $\color{orange}{\pi \rightarrow S_{0}, A_{0}, R_{1}, \cdots , S_{T-1}, A_{T-1}, R_{T}}$ 22 | - $\color{orange}G \leftarrow 0$ 23 | - loop `episode[::-1]`, i.e. $\color{orange}T-1, T-2, \cdots, 0$ 24 | - $\color{orange}G \leftarrow \gamma G + R_{t+1}$ 25 | - if state $\color{orange}s$ not in $\color{orange}[S_0, S_1, \cdots, S_{t-1} ]$: 26 | - $\color{orange} R(s) \leftarrow \text{append} \ G$ 27 | - $\color{orange} v_{\pi}(s) \leftarrow avg(R(s))$ 28 | 29 | 30 | The exact implementation is arbitrary. If the backwards iteration gets confusing, write the natural forward implementation that follows the basic principles of the algorithm; the two are isomorphic. 31 | 32 | If a model is not available it is particularly useful to estimate action values, because otherwise it would be difficult to assess the best action to take in a given state, not knowing the space of possible next states. In DP, state values are all you need to know, since you have the model. In MC you don't have the model, so you need to estimate action values. 33 | 34 | Monte Carlo methods for estimating action values are exactly the same as the above, but instead of averaging the returns for each state, you average the returns for each state action pair.
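Before moving on, here is a small self-contained sketch of first-visit MC prediction. It works on already-collected episodes; the `(state, action, reward)` tuple layout is an assumption of mine, not the exact structure used by this repo's solvers.

```python
from collections import defaultdict

import numpy as np

def first_visit_mc_prediction(episodes, gamma=1.0):
    """Average the return following the first visit to each state.

    episodes: list of episodes, each a time-ordered list of (state, action, reward).
    """
    returns = defaultdict(list)
    for episode in episodes:
        states = [s for s, _, _ in episode]
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):   # iterate episode[::-1]
            s, _, r = episode[t]
            G = gamma * G + r                       # accumulate the discounted return
            if s not in states[:t]:                 # first-visit check
                returns[s].append(G)
    return {s: np.mean(gs) for s, gs in returns.items()}
```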
35 | 36 | Major drawback: you may not visit all state-action pairs. An edge case: $\color{orange}\pi$ is deterministic. This is the exploration vs. exploitation issue, i.e. maintaining exploration. Solutions: 37 | - `exploring starts`: make sure that episodes start at specific state-action pairs, and that every pair has a nonzero probability of being selected as the start 38 | - `stochastic selection of all possible actions`: sometimes select a random action; remember the epsilon policies from Multi Armed Bandits. 39 | 40 | ### **Monte Carlo Control** 41 | 42 | We can do basically the same as before: evaluate, improve, evaluate, improve, etc... but this time we are not able to make a policy greedy just by using state value functions, since the model is lacking. Therefore we have a very similar picture as in DP, but this time 43 | 44 | $$\color{orange}{ 45 | \pi_0 \overset{\mathbb{E}}{\longrightarrow} q_{\pi_0} \overset{\mathbb{I}}{\longrightarrow} \pi_1 \overset{\mathbb{E}}{\longrightarrow} \cdots \overset{\mathbb{I}}{\longrightarrow} \pi_{\*} \overset{\mathbb{E}}{\longrightarrow} q_{\pi_{\*}} 46 | } 47 | $$ 48 | 49 | so we are selecting the policy $\color{orange}\pi_{k+1}$ to be greedy with respect to $\color{orange}q_{\pi_{k}}$. 50 | 51 | 52 | We can replicate value iteration and policy iteration in some sense. That is, we can try to fully estimate the `q` function before improving, or we can improve on an episode-by-episode basis. 53 | 54 | 55 | ### **Monte Carlo with Exploring Starts** 56 | 57 | Monte Carlo with **exploring starts** is the natural way to implement this idea 58 | 59 | **ES** 60 | 61 | - Input: $\color{orange}\pi$ 62 | - $\color{orange} G\leftarrow 0$ 63 | - $\color{orange} Q(s,a) \leftarrow q_0(s,a) \in \mathbb{R}$ 64 | - $\color{orange} R(s,a) \leftarrow \emptyset$ 65 | - Loop forever 66 | - generate episode $\color{orange}S_{0}, A_{0}, R_{1}, \cdots , S_{T-1}, A_{T-1}, R_{T}$ making sure that the first state-action pair is selected randomly 67 | - loop `episode[::-1]` : `index: (t=T-1; t>=0; t--)` 68 | - $\color{orange}G\leftarrow \gamma G + R_{t+1}$ 69 | - if `(s,a)` not present in `episode[:t]`: 70 | - $\color{orange}R(s,a) \leftarrow \text{append} (G)$ 71 | - $\color{orange}Q(s,a) \leftarrow avg(R(s,a))$ 72 | - $\color{orange}\pi(s) \leftarrow \text{greedy} (Q(s,a))$ 73 | 74 | 75 | ### **Monte Carlo without Exploring Starts** 76 | 77 | There is a basic division in policy improvement algorithms, on-policy vs. off-policy, and MC w/o Exploring Starts seems a nice way to introduce it, as Sutton does. 78 | 79 | - On-policy uses the same policy that it is optimizing to generate the data 80 | - Off-policy uses one policy to optimize and another one to _search_ or **generate the data**. 81 | 82 | This is like learning from someone else's experience vs. our own. With the latter we must concentrate on knowing how good we are at the task, and try to navigate the behavior space in a way such that we maximize a goal, or minimize a cost. The experience to which we are going to be exposed while investigating is going to be somewhat biased by our current one, so we are somewhat sensitive to local minima, you could say. When learning from someone else (or others) we are in principle not prisoners of our biased trajectories. But nevertheless we need to be able to say, to a degree, that the current actions are somewhat compatible with our history. We can basically explore, but weight the exploration by how useful it is to us. 83 | 84 | ES is somewhat unrealizable since we cannot always force episodes to start at arbitrary state-action pairs, so some pairs may never be visited.
86 | - Input: $\color{orange}\pi$ 87 | - $\color{orange} G\leftarrow 0$ 88 | - $\color{orange} Q(s,a) \leftarrow q_0(s,a) \in \mathbb{R}$ 89 | - $\color{orange} R(s,a) \leftarrow \emptyset$ 90 | - Loop forever 91 | - generate episode $\color{orange}S_{0}, A_{0}, R_{1}, \cdots , S_{T-1}, A_{T-1}, R_{T}$ following $\color{orange}\pi$ (no exploring start is needed since $\color{orange}\pi$ is kept $\varepsilon$-soft) 92 | - loop `episode[::-1]` : `index:(t=T-1; t>=0; t--)` 93 | - $\color{orange}G\leftarrow \gamma G + R_{t+1}$ 94 | - if `(s,a)` not present in `episode[:t]`: 95 | - $\color{orange}R(s,a) \leftarrow \text{append} (G)$ 96 | - $\color{orange}Q(s,a) \leftarrow avg(R(s,a))$ 97 | - $\color{orange}\pi(s) \leftarrow \text{greedy} (Q(s,a))$ 98 | - $\color{orange}\forall a \in \mathbin{A}(S_t)$ 99 | - if $\color{orange}a=\argmax_a Q(s,a) \rightarrow \pi(a|s) = 1-\varepsilon + \frac{\varepsilon}{|\mathbin{A}(S_t)|}$ else $\color{orange}\pi(a|s) = \frac{\varepsilon}{|\mathbin{A}(S_t)|}$ 100 | 101 | 102 | The above algorithm optimizes over the $\color{orange}\varepsilon$-soft policies, described as policies that have a non-zero probability of selecting any action in all possible states. That is, we are optimizing over a modified transition operator. This is a world in which sometimes noise kicks you out of what seems to be the best policy, so in turn you learn to optimize assuming that you may sometimes be kicked out of a local optimum. 103 | 104 | ### **Off-Policy Importance Sampling** 105 | 106 | How do we maintain exploration and at the same time explore all possible actions, to find the potentially better ones? The above algorithm implies a compromise, since we are optimizing over a near-optimal policy that still explores. An alternative is to have two policies: 107 | - one that explores: `behavior policy` 108 | - one that gets optimized: `target policy` 109 | 110 | Off-policy vs. on-policy tradeoffs (off / on): 111 | - harder / simpler 112 | - more data / less data 113 | - more variance / less variance 114 | - more time for convergence / less time for convergence 115 | - general framework, a superset that includes `on policy` / the special case `behavior = target` 116 | - learn from whatever you choose / learn from what you do 117 | 118 | **Importance sampling** comes into play here. This is just a technique for estimating expected values under one distribution, given samples from another one. The following identity is quite intuitive 119 | 120 | $$ 121 | \color{orange}{ 122 | \mathbb{E}_{\sim p}(f) = \int f(x) p(x) dx = \mathbb{E}_{\sim q}(f \cdot p/q) = \int \frac{f(x)p(x)}{q(x)}q(x) dx 123 | } 124 | $$ 125 | 126 | So in essence what `IS` does is weight each point in probability space by a factor proportional to how likely it is to be sampled from $\color{orange}p$ relative to $\color{orange}q$. 127 | 128 | In the case of episodes or state-action trajectories, we get that the probability of obtaining a trajectory $\color{orange}A_t, S_{t+1}, \cdots, S_T$ under policy $\color{orange}\pi$ is 129 | 130 | $$ 131 | \color{orange}{ 132 | Pr\left\{ A_t, S_{t+1}, \cdots, S_T | S_t, A_{t:T-1} \sim \pi \right\} = \prod_{k=t}^{T-1}\pi(A_k|S_k) \cdot p(S_{k+1}|S_k, A_k) 133 | } 134 | $$ 135 | 136 | The same applies to any policy. If the behavior policy is $\color{orange}b$, the probability of obtaining a particular trajectory is 137 | 138 | $$\color{orange}{ 139 | \prod_{k=t}^{T-1}b(A_k|S_k) \cdot p(S_{k+1}|S_k, A_k) 140 | } 141 | $$ 142 | 143 | even if the way the world transitions is hidden, i.e.
how the world makes an update to the global state, we can write 144 | 145 | $$\color{orange}{ 146 | \frac{\prod_{k=t}^{T-1}\pi(A_k|S_k) \cdot p(S_{k+1}|S_k, A_k)}{\prod_{k=t}^{T-1}b(A_k|S_k) \cdot p(S_{k+1}|S_k, A_k)} = \frac{\prod_{k=t}^{T-1}\pi(A_k|S_k)}{\prod_{k=t}^{T-1}b(A_k|S_k)} = \rho_{t:T-1} 147 | } 148 | $$ 149 | 150 | and $\color{orange}\rho_{t:T-1}$ is the weighting factor for importance sampling. After this, all that is left to understand is that 151 | - the probability space from which the samples are drawn corresponds to the state-action trajectories 152 | - the expectation operator has to be applied to the returns $\color{orange}G_t$ 153 | - returns are a mapping from trajectories to real numbers 154 | - the expectation is going to be the mean of the returns 155 | 156 | Extra notation: 157 | - $\color{orange}\tau(s)$: set of time steps in which state $\color{orange}s$ was visited. This is for every-visit. For first-visit, it would be the set of all time steps that were first visits to $s$ within their respective episodes. 158 | - $\color{orange} T(t)$: index of the last time step in the episode belonging to the range $\color{orange}[t, T-1]$ 159 | - $\color{orange}G(t)$: return after $t$ up through $\color{orange}T(t)$. 160 | 161 | We then define `ordinary importance sampling` as 162 | 163 | $$\color{orange}{ 164 | V(s) = \frac{\sum_{t\in \tau(s)} \rho_{t:T(t)-1}G_t}{|\tau(s)|} 165 | } 166 | $$ 167 | 168 | and `weighted importance sampling` as 169 | 170 | $$\color{orange}{ 171 | V(s) = \frac{\sum_{t\in \tau(s)} \rho_{t:T(t)-1}G_t}{\sum_{t\in \tau(s)} \rho_{t:T(t)-1}} 172 | } 173 | $$ 174 | 175 | Basic differences between the two: 176 | - first visit 177 | - `ordinary` is unbiased but can have extremely high variance, since the ratios are not bounded. 178 | - `weighted` is biased (although the bias converges to zero) and its variance is bounded by the maximum return. 179 | - every visit 180 | - `ordinary` is biased, but the bias converges to zero 181 | - `weighted` is biased, but the bias converges to zero 182 | 183 | Down below, a nice example displays the convergence problems of ordinary importance sampling. 184 | 185 | ![](/assets/images/infinite_variance.png) 186 | 187 |
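The two estimators are easy to compare side by side. Below is a tiny, generic sketch (not the repo's `off_policy_mc`): given the returns observed under the behavior policy and their matching ratios, it computes both estimates.

```python
import numpy as np

def is_estimates(returns, rhos):
    """Ordinary vs. weighted importance-sampling estimates of V(s)."""
    returns, rhos = np.asarray(returns, dtype=float), np.asarray(rhos, dtype=float)
    ordinary = (rhos * returns).sum() / len(returns)
    weighted = (rhos * returns).sum() / rhos.sum() if rhos.sum() > 0 else 0.0
    return ordinary, weighted

# toy usage: one lucky episode with a huge ratio, two that the target policy would never produce
print(is_estimates(returns=[1.0, 0.0, 1.0], rhos=[10.0, 0.0, 0.0]))  # ordinary ~3.33, weighted 1.0
```

The toy numbers illustrate why ordinary importance sampling can blow up: a single large ratio dominates the unnormalized average, while the weighted estimate stays within the range of observed returns.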
188 | 189 | **First-Visit off-policy evaluation naive implementation** 190 | 191 | - Input: $\color{orange}\pi$ 192 | - $\color{orange} Q(s,a) \leftarrow q_0(s,a) \in \mathbb{R}$ 193 | - $\color{orange} R(s,a) \leftarrow \emptyset$ 194 | - $\color{orange} \tau(s,a) \leftarrow \emptyset$ 195 | - $\color{orange} \rho(s,a) \leftarrow \emptyset$ 196 | - Loop some number of episodes 197 | - $\color{orange} b\leftarrow$ any policy with coverage of $\color{orange}\pi$ 198 | - $\color{orange} G\leftarrow 0$ 199 | - $\color{orange} W \leftarrow 1$ 200 | - generate episode with $\color{orange}b \rightarrow S_{0}, A_{0}, R_{1}, \cdots , S_{T-1}, A_{T-1}, R_{T}$ 201 | - loop `episode[::-1]` - `index:t => (t=T-1; t>=0; t--)` 202 | - $\color{orange}G\leftarrow \gamma G + R_{t+1}$ 203 | - $\color{orange}W\leftarrow W \cdot \frac{\pi(a_t|s_t)}{b(a_t|s_t)}$ 204 | - if $\color{orange}W=0$ then break 205 | - if `(s,a)` not in `episode[:t]`: 206 | - $\color{orange}\tau(s,a) \leftarrow append(t)$ 207 | - $\color{orange}R(s,a) \leftarrow \text{append} (G)$ 208 | - $\color{orange}\rho(s,a) \leftarrow \text{append} (W)$ 209 | - $\color{orange} Q(s,a) \leftarrow \frac{\sum \rho R(s,a)}{\sum \rho}$ 210 | 211 |
213 | 214 | **Incremental implementation** 215 | 216 | The methods described in the multi armed bandits section can easily be applied when implementing incremental versions of the Monte Carlo algorithms displayed in these notes. The one difference in the update rule corresponds to `weighted` averages, since the update no longer depends only on the visit count. 217 | 218 | Given a set of returns $\color{orange}{G}_k$ and a respective set of weights $\color{orange}\rho_k$ the weighted average is 219 | 220 | $$\color{orange}{ 221 | V_{K+1}(s) = \frac{\sum_{k=1}^K \rho_k G_k}{\sum_{k=1}^K \rho_k} 222 | } 223 | $$ 224 | 225 | therefore 226 | 227 | $$\color{orange}{ 228 | V_{K+1}(s) = \frac{\sum_{k=1}^{K-1}{\rho_k G_k} + \rho_K G_K}{\sum_{k=1}^{K} \rho_k} = V_K + \frac{\rho_K}{\sum_{k=1}^K \rho_k} (G_K - V_K ) 229 | } 230 | $$ 231 | 232 | and this can be implemented without saving the lists of returns and weights: keeping the running value $\color{orange}V_K$ and the cumulative weight $\color{orange}C_K = \sum_{k=1}^K \rho_k$ is enough. 233 | 234 |
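In code the incremental rule is a one-liner per visit; here is a hedged sketch (the function name and signature are mine, not part of this repo).

```python
def incremental_weighted_update(v, c, g, rho):
    """One weighted-importance-sampling update: V_{K+1} = V_K + rho/C * (G - V_K).

    v: current estimate V_K, c: cumulative weight so far, g: new return, rho: its weight.
    Returns the updated (v, c); no lists of returns or weights need to be stored.
    """
    if rho == 0.0:
        return v, c
    c += rho
    v += (rho / c) * (g - v)
    return v, c
```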
235 | 236 | ### **Off Policy Control** 237 | 238 | This is the fun part: using off-policy methods to do policy improvement. So let's enumerate the concepts and requirements 239 | - the `behavior` policy $\color{orange}b$ is going to generate the episodes. 240 | - `coverage` must be guaranteed 241 | - it must be soft, i.e. $\color{orange}b(a|s) > 0 \ \forall s \in S, \ a \in A$ 242 | - the `target` policy is the one that is going to be greedy with respect to the q function. 243 | 244 | **Off-policy MC control incremental implementation for finding $\color{orange}\pi \approx \pi^{*}$** (a Python sketch of the inner loop follows the pseudocode) 245 | 246 | - Initialize: 247 | - $\color{orange} Q(s,a) \leftarrow q_0(s,a) \in \mathbb{R}$ 248 | - $\color{orange} C(s,a) \leftarrow 0$ 249 | - $\color{orange} \pi(s) = \argmax_{a} Q(s,a)$ 250 | - Loop some number of episodes 251 | - $\color{orange} b\leftarrow$ any soft policy with coverage of $\color{orange}\pi$ 252 | - $\color{orange} G\leftarrow 0$ 253 | - $\color{orange} W \leftarrow 1$ 254 | - generate episode with $\color{orange}b \rightarrow S_{0}, A_{0}, R_{1}, \cdots , S_{T-1}, A_{T-1}, R_{T}$ 255 | - loop `episode[::-1]` - `index:t => (t=T-1; t>=0; t--)` 256 | - $\color{orange}G\leftarrow \gamma G + R_{t+1}$ 257 | - $\color{orange}C(s,a) \leftarrow C(s,a) + W$ 258 | - $\color{orange}Q(s,a) \leftarrow Q(s,a) + \frac{W}{C(s,a)}(G-Q(s,a))$ 259 | - $\color{orange}\pi(s) \leftarrow \argmax_{a}Q(s,a)$ 260 | - if $\color{orange}\pi(s_t)\neq a_t \rightarrow $ break out of the inner loop (move on to the next episode) 261 | - $\color{orange}W \leftarrow W\cdot \frac{1}{b(a|s)}$ 262 | 263 |
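As announced above, here is a standalone, dictionary-based sketch of that inner loop. It is not the repo's `off_policy_mc` implementation, and all names are mine; it only illustrates the weighted-importance-sampling control update just described.

```python
def off_policy_mc_control_update(Q, C, episode, actions, b_prob, gamma=1.0):
    """Process one behavior-policy episode with weighted importance sampling.

    Q, C:    dicts mapping (state, action) -> estimate / cumulative weight
    episode: time-ordered list of (state, action, reward) generated by b
    b_prob:  function (state, action) -> b(a|s)
    """
    G, W = 0.0, 1.0
    for s, a, r in reversed(episode):
        G = gamma * G + r
        C[(s, a)] = C.get((s, a), 0.0) + W
        q_old = Q.get((s, a), 0.0)
        Q[(s, a)] = q_old + (W / C[(s, a)]) * (G - q_old)
        greedy_a = max(actions, key=lambda a_: Q.get((s, a_), 0.0))  # greedy target policy
        if a != greedy_a:
            break                        # pi(s_t) != a_t: ratios for earlier steps are zero
        W *= 1.0 / b_prob(s, a)          # target is greedy, so pi(a|s) = 1
    return Q, C
```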
265 | 266 | As a final remark, it is worth noting that there exist two other algorithms in the book, more specialized towards reducing variance and achieving faster convergence. They do not add to the core conceptual understanding of the problem and of how Monte Carlo solves it. 267 | - `discounting-aware importance sampling` 268 | - `per-decision importance sampling` 269 | 270 |
272 | 273 | 274 | 275 | 276 | ## Monte Carlo Tree Search MCTS 277 | 278 | MCTS is a Monte Carlo method used in planning and decision making. It balances the exploration/exploitation trade-off, and it can succeed even with little domain knowledge. 279 | 280 | The basic implementation is very simple. 281 | - A tree is built in an incremental and asymmetric manner. 282 | - For each iteration a _tree policy_ is used to find the best node to expand. This is the policy that tries to balance exploration and exploitation. 283 | - A simulation is then run from the leaf node that was selected, and the node is updated according to the result of this simulation (i.e. the reward is used to backpropagate statistics). A _default policy_ specifies how to simulate from a given state. It can be really simple, uniform for instance. 284 | 285 | 286 | A quote from a 2012 literature review paper: 287 | 288 | _However, it is really the success in computer Go, through the recursive application of Monte Carlo methods during the tree-building process, which has been responsible for much of the interest in MCTS. This is because Go is one of the few classic games for which human players are so far ahead of computer players. MCTS has had a dramatic effect on narrowing this gap, and is now competitive with the very best human players on small boards, though MCTS falls far short of their level on the standard 19×19 board._ 289 | 290 | Four years later, in 2016, AlphaGo, which combines MCTS with deep neural networks, was able to beat world champion Lee Sedol. 291 | 292 | The algorithm consists, as already stated, of building a search tree until some predefined constraint has been reached. 293 | 294 | - Selection: from the root node, a child is selected by recursively applying the tree policy. When a nonterminal state that has not yet been visited is encountered, we halt. 295 | - Expansion: one (or more) child nodes are created to expand the tree, according to the available actions. 296 | - Simulation: a simulation is run from the expanded node, using the default policy to produce an outcome. 297 | - Backpropagation: the statistics of the selected nodes are updated. 298 | 299 | - _Tree Policy_: select/create leaf nodes to expand the tree. Selection and expansion. 300 | - _Default Policy_: play out the domain from a given non-terminal state. 301 | 302 | MCTS here targets finite-horizon, finite-size MDPs, based on random episode sampling structured as a decision tree. The requisites for its use are listed below, followed by a short sketch of the UCT selection rule. 303 | 304 | - the state-action space must be finite 305 | - the MDP must be finite horizon 306 | - the MDP must be undiscounted, i.e. $\color{orange}\gamma = 1$. 307 | 308 | 309 |
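The repo's `mcts` solver takes an exploration constant `cp` (the README example uses `cp = 1/np.sqrt(2)`), which points at the standard UCB1/UCT selection rule for the tree policy. Below is a generic sketch of that rule only; the node attributes are assumptions of mine, not this repo's tree structure.

```python
import numpy as np

def uct_select(children, cp=1/np.sqrt(2)):
    """Pick the child maximizing the UCB1 score used by the UCT tree policy.

    children: nodes exposing .value (total simulated reward), .visits, and .parent.visits
    cp:       exploration constant balancing exploitation vs. exploration
    """
    def ucb1(node):
        if node.visits == 0:
            return np.inf                            # unvisited children are tried first
        exploit = node.value / node.visits           # average reward of simulations through this node
        explore = cp * np.sqrt(2 * np.log(node.parent.visits) / node.visits)
        return exploit + explore
    return max(children, key=ucb1)
```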
310 |
311 | 312 | ### Copyright 313 | Copyright © 2023 Iván Belenky. The code in this repository is licensed under the MIT License. 314 | All this notes correspond to Sutton's book, this is just a summary. 315 | -------------------------------------------------------------------------------- /examples/blackjack.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from rl.solvers.model_free import ( 4 | alpha_mc, 5 | off_policy_mc, 6 | tdn 7 | ) 8 | 9 | VALUES = ['A','2','3','4','5','6','7','8','9','10','J','Q','K'] 10 | SUITS = ['♠','♥','♦','♣'] 11 | CARDS = [(value,suit) for value in VALUES for suit in SUITS] 12 | 13 | states = [(i, False, dealer_showing) 14 | for i in range(4,21) 15 | for dealer_showing in VALUES] 16 | 17 | states += [(i, True, dealer_showing) 18 | for i in range(12,21) 19 | for dealer_showing in VALUES] 20 | 21 | actions = ['hit', 'stand'] 22 | 23 | def count(cards): 24 | counts = [0] 25 | for value in cards: 26 | if value in ['J','Q','K']: 27 | counts = [c+10 for c in counts] 28 | elif value == 'A': 29 | counts = [c+1 for c in counts] + [c+11 for c in counts] 30 | else: 31 | counts = [c+int(value) for c in counts] 32 | 33 | valid_counts = [c for c in counts if c <= 21] 34 | if len(valid_counts) == 0: 35 | return min(counts) 36 | return max(valid_counts) 37 | 38 | 39 | def black_jack_transition(state, action): 40 | player_sum, usable_ace, dealer_showing = state 41 | 42 | if action == 'hit' and player_sum < 21: 43 | new_card = random.choice(VALUES) 44 | if new_card == 'A': 45 | if player_sum + 11 > 21: 46 | card_value = 1 47 | else: 48 | card_value = 11 49 | usable_ace = True 50 | elif new_card in ['J','Q','K']: 51 | card_value = 10 52 | else: 53 | card_value = int(new_card) 54 | 55 | player_sum += card_value 56 | if usable_ace and player_sum > 21: 57 | player_sum -= 10 58 | usable_ace = False 59 | 60 | if player_sum > 21: 61 | return (state, -1.), True 62 | elif player_sum == 21: 63 | pass 64 | else: 65 | new_state = (player_sum, usable_ace, dealer_showing) 66 | return (new_state, 0.), False 67 | 68 | dealer_cards = [dealer_showing] 69 | dealer_sum = count(dealer_cards) 70 | if action == 'stand': 71 | dealer_plays = True 72 | while dealer_plays: 73 | dealer_sum = count(dealer_cards) 74 | if dealer_sum < 17: 75 | dealer_cards.append(random.choice(VALUES)) 76 | continue 77 | elif dealer_sum > 21: 78 | return (state, 1.), True 79 | elif 17 <= dealer_sum < 22: 80 | dealer_plays = False 81 | 82 | if dealer_sum > player_sum: 83 | return (state, -1.), True 84 | elif dealer_sum < player_sum: 85 | return (state, 1.), True 86 | elif dealer_sum == player_sum: 87 | return (state, 0.), True 88 | 89 | 90 | vqpi, samples = alpha_mc(states, actions, black_jack_transition, gamma=0.9, 91 | use_N=True, n_episodes=1E4, first_visit=False) 92 | -------------------------------------------------------------------------------- /examples/dyna_maze.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use('dark_background') 4 | 5 | from rl import dynaq, ModelFree 6 | 7 | GRID_HEIGHT, GRID_WIDTH = 6, 9 8 | START_XY, GOAL_XY = (0,3), (8,5) 9 | OBSTACLES = [(2,2), (2,3), (2,4), (5,1), (7,3), (7,4), (7,5)] 10 | 11 | states = [(x,y) for x in range(GRID_WIDTH) for y in range(GRID_HEIGHT) 12 | if (x,y) not in OBSTACLES] 13 | actions = ['left', 'right', 'up', 'down'] 14 | 15 | 16 | def obstacle_maze(state, action): 17 | x,y = state 18 | x_n, y_n = x, 
y 19 | 20 | if (x,y) == GOAL_XY: 21 | return (state, 1), True 22 | 23 | if action == 'left': 24 | x_n -= 1 25 | if action == 'right': 26 | x_n += 1 27 | if action == 'up': 28 | y_n += 1 29 | if action == 'down': 30 | y_n -= 1 31 | 32 | if x_n < 0 or x_n >= GRID_WIDTH: 33 | x_n = x 34 | if y_n < 0 or y_n >= GRID_HEIGHT: 35 | y_n = y 36 | if (x_n, y_n) in OBSTACLES: 37 | x_n, y_n = x, y 38 | 39 | state_n = (x_n, y_n) 40 | return (state_n, 0), False 41 | 42 | 43 | vqpi_0, samples_0 = dynaq(states, actions, obstacle_maze, START_XY, 44 | n_episodes=50, gamma=0.95, alpha=0.5, eps=0.1, n=0, max_steps=2E3) 45 | 46 | # plot found policy 47 | final_policy = samples_0[-1][-1] 48 | mf = ModelFree(states, actions, obstacle_maze, gamma=0.95, policy=final_policy) 49 | 50 | lrud = ['<', '>', '^', 'v'] 51 | pi = vqpi_0[2].pi 52 | 53 | plt.figure(figsize=(6,6)) 54 | for s, p in zip(states, pi): 55 | marker = lrud[np.argmax(p)] 56 | plt.scatter(s[0], s[1], c='red', marker=marker) 57 | 58 | for x,y in OBSTACLES: 59 | plt.scatter(x, y, c='white', marker='s') 60 | 61 | plt.xticks([]) 62 | plt.yticks([]) 63 | plt.show() 64 | 65 | 66 | # steps per episode function of N planning steps 67 | 68 | NS = [0, 5, 50] 69 | SMOOTH = 30 70 | 71 | model = ModelFree(states, actions, obstacle_maze,gamma=0.95) 72 | init_state = model.states.get_index(START_XY) 73 | 74 | all_steps_per_episode = [] 75 | for i in range(SMOOTH): 76 | vqpi_0, samples_0 = dynaq(states, actions, obstacle_maze, START_XY, 77 | n_episodes=50, gamma=0.95, alpha=0.1, eps=0.1, n=0, max_steps=1E4) 78 | vqpi_5, samples_5 = dynaq(states, actions, obstacle_maze, START_XY, 79 | n_episodes=50, gamma=0.95, alpha=0.1, eps=0.1, n=5, max_steps=1E4) 80 | vqpi_50, samples_50 = dynaq(states, actions, obstacle_maze, START_XY, 81 | n_episodes=50, gamma=0.95, alpha=0.1, eps=0.1, n=50, max_steps=1E4) 82 | 83 | steps_per_episode = [] 84 | for s0, s5, s50 in zip(samples_0, samples_5, samples_50): 85 | pi0, pi5, pi50 = s0[3], s5[3], s50[3] 86 | a0, a5, a50 = pi0(init_state), pi5(init_state), pi50(init_state) 87 | 88 | ep0 = model.generate_episode(START_XY, actions[a0] ,policy=pi0) 89 | ep5 = model.generate_episode(START_XY, actions[a5] ,policy=pi5) 90 | ep50 = model.generate_episode(START_XY, actions[a50] ,policy=pi50) 91 | 92 | steps_per_episode.append([len(ep0), len(ep5), len(ep50)]) 93 | all_steps_per_episode.append(steps_per_episode) 94 | 95 | 96 | steps_per_episode = np.mean(all_steps_per_episode, axis=0) 97 | 98 | mean_ep_steps = np.mean(all_steps_per_episode, axis=0) 99 | 100 | plt.figure(figsize=(6,6)) 101 | for i, n in enumerate(NS): 102 | plt.plot(mean_ep_steps[1:,i], linewidth=2, label='n={}'.format(n)) 103 | plt.legend(loc=1) 104 | plt.xlabel('Episode') 105 | plt.ylabel('Steps per episode') 106 | 107 | plt.show() -------------------------------------------------------------------------------- /examples/gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL - Copyright © 2023 Iván Belenky @Leculette 3 | """ 4 | import sys 5 | 6 | import numpy as np 7 | 8 | from rl.mdp import MDP, TabularReward 9 | 10 | GRID_SIZE = 5 # 5x5 gridworld 11 | 12 | def main(): 13 | optimization_method = sys.argv[1] if len(sys.argv) > 1 else None 14 | if optimization_method is None: 15 | print("No optimization method specified") 16 | return 17 | 18 | actions = np.arange(4) # up, right, down, left 19 | states = np.arange(GRID_SIZE**2) 20 | p_s = np.zeros((GRID_SIZE**2, 4, GRID_SIZE**2)) 21 | 22 | #initialized the transition matrix 23 | 
for i in range(GRID_SIZE): 24 | for j in range(GRID_SIZE): 25 | state_idx = i*GRID_SIZE+j 26 | p_s[state_idx][0][max(i-1,0)*GRID_SIZE+j] = 1 27 | p_s[state_idx][1][i*GRID_SIZE+min(j+1,GRID_SIZE-1)] = 1 28 | p_s[state_idx][2][min(i+1,GRID_SIZE-1)*GRID_SIZE+j] = 1 29 | p_s[state_idx][3][i*GRID_SIZE+max(j-1,0)] = 1 30 | 31 | #rewrite probs for potential positions of teleport 32 | p_s[0][1] = np.zeros(GRID_SIZE**2) 33 | p_s[0][1][21] = 1 34 | 35 | p_s[2][3] = np.zeros(GRID_SIZE**2) 36 | p_s[2][3][21] = 1 37 | 38 | p_s[6][0] = np.zeros(GRID_SIZE**2) 39 | p_s[6][0][21] = 1 40 | 41 | p_s[2][1] = np.zeros(GRID_SIZE**2) 42 | p_s[2][1][13] = 1 43 | 44 | p_s[4][3] = np.zeros(GRID_SIZE**2) 45 | p_s[4][3][13] = 1 46 | 47 | p_s[8][0] = np.zeros(GRID_SIZE**2) 48 | p_s[8][0][13] = 1 49 | 50 | 51 | #by not specifying the policy we get a equal prob one 52 | #it is fair to notice that this init process is tedious for tabular MDPs 53 | 54 | #initializing reward, if we land on target 55 | r_sa = np.zeros((GRID_SIZE**2, 4)) 56 | 57 | #Border. 58 | #If it try to go out of the grid, it gets -1 reward 59 | for i in range(GRID_SIZE): 60 | r_sa[i][0] = -1 61 | r_sa[i*GRID_SIZE+GRID_SIZE-1][1] = -1 62 | r_sa[i*GRID_SIZE][3] = -1 63 | r_sa[GRID_SIZE*(GRID_SIZE-1)+i][2] = -1 64 | 65 | #A 66 | #If lands on (0,1) position it gets a reward of +10 67 | r_sa[0][1] += 10 68 | r_sa[2][3] += 10 69 | r_sa[6][0] += 10 70 | 71 | #B 72 | #If lands on (0,3) position it gets a reward of +5 73 | r_sa[2][1] += 5 74 | r_sa[4][3] += 5 75 | r_sa[8][0] += 5 76 | 77 | print("Reward matrix going up") 78 | print(r_sa[:,0].reshape(GRID_SIZE,GRID_SIZE)) 79 | print("Reward matrix going right") 80 | print(r_sa[:,1].reshape(GRID_SIZE,GRID_SIZE)) 81 | print("Reward matrix going down") 82 | print(r_sa[:,2].reshape(GRID_SIZE,GRID_SIZE)) 83 | print("Reward matrix going left") 84 | print(r_sa[:,3].reshape(GRID_SIZE,GRID_SIZE)) 85 | 86 | #Define the Markov Decision Process 87 | mdp = MDP(p_s, states, actions, gamma = 0.9, 88 | reward_gen=TabularReward(r_sa)) 89 | 90 | #calculate beforehand 91 | v, q = mdp.vq_pi() 92 | print("Value Function before optimizing") 93 | print(v.reshape(GRID_SIZE,GRID_SIZE)) 94 | print('-'*50) 95 | print("Q Function before optimizing") 96 | print(q.reshape(GRID_SIZE,GRID_SIZE,4)) 97 | print('\n') 98 | 99 | mdp.optimize_policy(method=optimization_method) 100 | v, q = mdp.vq_pi() 101 | print("Value Function after optimizing") 102 | print(v.reshape(GRID_SIZE,GRID_SIZE)) 103 | print('-'*50) 104 | print("Q Function after optimizing") 105 | print(q.reshape(GRID_SIZE,GRID_SIZE,4)) 106 | print('\n') 107 | 108 | print("Optimal policy up action") 109 | print(mdp.policy.pi_sa[:,0].reshape(GRID_SIZE, GRID_SIZE)) 110 | print("Optimal policy right action") 111 | print(mdp.policy.pi_sa[:,1].reshape(GRID_SIZE, GRID_SIZE)) 112 | print("Optimal policy down action") 113 | print(mdp.policy.pi_sa[:,2].reshape(GRID_SIZE, GRID_SIZE)) 114 | print("Optimal policy left action") 115 | print(mdp.policy.pi_sa[:,3].reshape(GRID_SIZE, GRID_SIZE)) 116 | 117 | 118 | if __name__ == '__main__': 119 | main() -------------------------------------------------------------------------------- /examples/mcts.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use("dark_background") 4 | 5 | from rl import mcts 6 | 7 | GRID_HEIGHT, GRID_WIDTH = 6, 9 8 | START_XY, GOAL_XY = (0,3), (8,5) 9 | OBSTACLES = [(2,2), (2,3), (2,4), (5,1), (7,3), (7,4), (7,5)] 10 | 11 | 
states = [(x,y) for x in range(GRID_WIDTH) for y in range(GRID_HEIGHT) 12 | if (x,y) not in OBSTACLES] 13 | actions = ['left', 'right', 'up', 'down'] 14 | 15 | 16 | def obstacle_maze(state, action): 17 | x,y = state 18 | x_n, y_n = x, y 19 | 20 | reward = -0.05 21 | if action == 'left': 22 | x_n -= 1 23 | if action == 'right': 24 | x_n += 1 25 | if action == 'up': 26 | y_n += 1 27 | if action == 'down': 28 | y_n -= 1 29 | 30 | if x_n < 0 or x_n >= GRID_WIDTH: 31 | x_n = x 32 | if y_n < 0 or y_n >= GRID_HEIGHT: 33 | y_n = y 34 | if (x_n, y_n) in OBSTACLES: 35 | x_n, y_n = x, y 36 | 37 | state_n = (x_n, y_n) 38 | if state_n == GOAL_XY: 39 | return (state_n, 1), True 40 | return (state_n, reward), False 41 | 42 | def action_map(state): 43 | possible_actions = [] 44 | for a in actions: 45 | (s, _), _ = obstacle_maze(state, a) 46 | if s != state: 47 | possible_actions.append(a) 48 | return possible_actions 49 | 50 | 51 | 52 | if __name__ == "__main__": 53 | s = START_XY 54 | end = False 55 | tree = None 56 | while not end: 57 | action, _ = mcts(s, 0.0, 500, obstacle_maze, action_map, 25, eps=1) 58 | print(s, action) 59 | (s, _), end = obstacle_maze(s, action) 60 | 61 | tree.plot() -------------------------------------------------------------------------------- /examples/mountain_car.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use("dark_background") 4 | 5 | from rl import semigrad_tdn, gradient_mc, IHT, tiles 6 | from rl.approximators import LinearApproximator 7 | 8 | 9 | ACTIONS = (-1, 0, 1) 10 | X_BOUND = [-1.2, 0.5] 11 | V_BOUND = [-0.07, 0.07] 12 | 13 | 14 | def mountain_car(state, action): 15 | x, v = state 16 | new_v = v + 0.001*action - 0.0025*np.cos(3*x) 17 | new_x = x + new_v 18 | 19 | if new_x < X_BOUND[0]: 20 | new_x = X_BOUND[0] 21 | new_v = 0 22 | return ((new_x, new_v), -1), False 23 | elif new_x > X_BOUND[1]: 24 | return (state, 10), True 25 | else: 26 | new_v = np.clip(new_v, V_BOUND[0], V_BOUND[1]) 27 | return ((new_x, new_v), -1), False 28 | 29 | 30 | def state_generator(): 31 | x = np.random.uniform(X_BOUND[0], X_BOUND[1]) 32 | v = np.random.uniform(V_BOUND[0], V_BOUND[1]) 33 | return (x, v) 34 | 35 | 36 | iht_s = IHT(1000) 37 | iht_sa = IHT(4096) 38 | 39 | 40 | def state_action_aggregator(sa): 41 | s, a = sa 42 | x, v = s 43 | f = np.zeros(4096) 44 | tile = tiles(iht_sa, 8, [8*x/(0.5+1.2), 8*v/(0.07+0.07)], [a]) 45 | f[tile] = 1 46 | return f 47 | 48 | 49 | def state_aggregator(state): 50 | x, v = state 51 | f = np.zeros(1000) 52 | tile = tiles(iht_s, 8, [8*x/(0.5+1.2), v/(0.07+0.07)]) 53 | f[tile] = 1 54 | return f 55 | 56 | 57 | if __name__ == "__main__": 58 | vhat = LinearApproximator(fs=1000, basis=state_aggregator) 59 | qhat = LinearApproximator(fs=4096, basis=state_action_aggregator) 60 | 61 | vqpi_mc, samples_mc = gradient_mc(mountain_car, state_generator, ACTIONS, 62 | vhat, q_hat=qhat, state_0=(0,0), action_0=0, n_episodes=500, 63 | max_steps=1E4, alpha=0.1/8, eps=0.1, optimize=True) 64 | -------------------------------------------------------------------------------- /examples/random_walk.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rl import tdn, alpha_mc 4 | 5 | states = [1,2,3,4,5] 6 | actions = ['?'] #there are no actions :D 7 | 8 | def random_walk(state, action): 9 | go_right = np.random.random() > 0.5 10 | if go_right: 11 | if 1+state <= 5: 12 | return (1+state, 0), False 
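        # stepping right from state 5 exits through the right terminal; this is
        # the only transition in the chain that pays a non-zero reward (+1)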
13 | return (state, 1), True 14 | else: 15 | if state-1 == 0: 16 | return (state, 0), True 17 | return (state-1, 0), False 18 | 19 | 20 | _, samples_mc_01 = alpha_mc(states, actions, random_walk, alpha=0.01, 21 | first_visit=True, n_episodes=200) 22 | 23 | # ... -------------------------------------------------------------------------------- /examples/short_corridor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use('dark_background') 4 | 5 | from tqdm import tqdm 6 | 7 | from rl import reinforce_mc 8 | from rl.approximators import ModelFreeTL, LinearApproximator 9 | 10 | actions = ['left', 'right'] 11 | 12 | def short_corridor(state, action): 13 | go_right = (action == 'right') 14 | if state == 1: 15 | if go_right: 16 | return (2, -1), False 17 | return (1, 0), False 18 | if state == 2: 19 | if go_right: 20 | return (1, -1), False 21 | return (3, -1), False 22 | if state == 3: 23 | if go_right: 24 | return (state, 0), True 25 | return (2, -1), False 26 | 27 | 28 | def random_state(): 29 | return np.random.randint(1,4) 30 | 31 | 32 | def state_action_aggregator(sa): 33 | _, a = sa 34 | right = (a == 'right') 35 | if right: 36 | return np.array([1., 0.]) 37 | return np.array([0., 1.]) 38 | 39 | 40 | if __name__ == "__main__": 41 | pi_hat = LinearApproximator(fs=2, basis=state_action_aggregator) 42 | pi_hat.w = np.array([-1.47, 1.47]) 43 | 44 | pi, samples = reinforce_mc(short_corridor, random_state, pi_hat, actions, state_0=1, alpha=2E-4, 45 | gamma=1, n_episodes=1000, max_steps=1000, samples=100, tol=1/np.inf) 46 | model = ModelFreeTL(short_corridor, random_state, pi, gamma=1) 47 | 48 | SMOOTH = 100 49 | rewards = [] 50 | for i in tqdm(range(SMOOTH)): 51 | _rewards = [] 52 | for policy in samples: 53 | a0 = policy(1) 54 | episode = model.generate_episode(1, a0, policy=policy, max_steps=100) 55 | sar = np.array(episode) 56 | _rewards.append(sar[:,2].astype(int).sum()) 57 | rewards.append(_rewards) 58 | 59 | plt.plot(np.array(rewards).mean(axis=0)) 60 | plt.show() 61 | -------------------------------------------------------------------------------- /examples/single_state.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use("dark_background") 4 | 5 | from rl import tdn, alpha_mc, off_policy_mc, ModelFreePolicy 6 | 7 | # Define model free 8 | states = [0] 9 | actions = ['left', 'right'] 10 | 11 | def single_state_transition(state, action): 12 | if action == 'right': 13 | return (state, 0), True 14 | if action == 'left': 15 | threshold = np.random.random() 16 | if threshold > 0.9: 17 | return (state, 1), True 18 | else: 19 | return (state, 0), False 20 | 21 | b = ModelFreePolicy(actions, states) #by default 1 half 22 | pi = ModelFreePolicy(actions, states) 23 | pi.pi[0] = np.array([1, 0]) 24 | 25 | 26 | # calculate ordinary and weighted samples state value functions 27 | vqpi_ord, samples_ord = off_policy_mc(states, actions, single_state_transition, 28 | policy=pi, b=b, ordinary=True, first_visit=True, gamma=1., n_episodes=1E4) 29 | 30 | vqpi_w, samples_w = off_policy_mc(states, actions, single_state_transition, 31 | policy=pi, b=b, ordinary=False, first_visit=True, gamma=1., n_episodes=1E4) 32 | 33 | 34 | #Plot! 
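# Under the target policy (always 'left') every episode eventually terminates
# with a single reward of 1, so the true v(0) is 1. Weighted importance
# sampling settles there, while the ordinary importance-sampling estimate
# keeps spiking: its variance under the 50/50 behavior policy is unbounded.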
35 | vords = [v[1].values()[0] for v in samples_ord[1:]] 36 | vw = [v[1].values()[0] for v in samples_w[1:]] 37 | idxs = [v[0] for v in samples_ord[1:]] 38 | 39 | plt.figure(figsize=(10, 5)) 40 | plt.plot(idxs, vords, label='Ordinary Importance Sampling') 41 | plt.plot(idxs, vw, label='Weighted Importance Sampling') 42 | plt.xlabel('No episodes') 43 | plt.ylabel('v(0)') 44 | plt.xscale('log') 45 | plt.legend(loc=1) 46 | plt.show() -------------------------------------------------------------------------------- /examples/state_aggregation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use("dark_background") 4 | 5 | from rl import gradient_mc, semigrad_tdn 6 | from rl.approximators import LinearApproximator 7 | 8 | 9 | states = [(i//101,i) for i in range(1001)] 10 | actions = ['?'] #there are no actions :D 11 | 12 | 13 | def random_walk(state, action): 14 | group, pos = state 15 | go_right = np.random.random() > 0.5 16 | steps = np.random.randint(1,100) 17 | 18 | if go_right: 19 | if pos+steps <= 1000: 20 | new_pos = pos+steps 21 | new_group = new_pos//101 22 | return ((new_group, new_pos), 0), False 23 | return (state, 1), True 24 | else: 25 | if pos-steps > 0: 26 | new_pos = pos-steps 27 | new_group = new_pos//101 28 | return ((new_group, new_pos), 0), False 29 | return (state, -1), True 30 | 31 | 32 | def state_aggregator(state): 33 | group, _ = state 34 | x = np.zeros(10) 35 | x[group] = 1 36 | return x 37 | 38 | 39 | def state_generator(): 40 | pos = np.random.randint(1,1000) 41 | group = pos//101 42 | return (group, pos) 43 | 44 | 45 | if __name__ == "__main__": 46 | approximator_mc = LinearApproximator(k=10, fs=10, basis=state_aggregator) 47 | approximator_td = LinearApproximator(k=10, fs=10, basis=state_aggregator) 48 | 49 | vqpi_mc, samples_mc = gradient_mc(random_walk, state_generator, actions, 50 | approximator_mc, n_episodes=3E4, max_steps=1E5, 51 | alpha=2*10E-5) 52 | vqpi_td, samples_td = semigrad_tdn(random_walk, state_generator, actions, 53 | approximator_td, n_episodes=3E4, max_steps=1E5, 54 | alpha=2*10E-5) 55 | 56 | vhat_mc = vqpi_mc[0] 57 | vhat_td = vqpi_td[0] 58 | 59 | state_sample =[(pos//101, pos) for pos in np.arange(1001)] 60 | vpi_true = 2/1000*np.arange(1001) - 1 61 | vpi_mc = np.array([vhat_mc(s) for s in state_sample]) 62 | vpi_td = np.array([vhat_td(s) for s in state_sample]) 63 | 64 | plt.figure(figsize=(10,5)) 65 | plt.plot(vpi_true, label='True value') 66 | plt.plot(vpi_td, label='semigrad-tdn') 67 | plt.plot(vpi_mc, label='gradient-mc') 68 | plt.legend(loc=4) -------------------------------------------------------------------------------- /examples/windy_gridworld.py: -------------------------------------------------------------------------------- 1 | from rl import ( 2 | tdn, 3 | ModelFree, 4 | EpsilonSoftPolicy 5 | ) 6 | 7 | GRID_HEIGHT = 7 8 | GRID_WIDTH = 10 9 | WIND_WEIGHT_X = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0] 10 | GOAL_XY = (7, 3) 11 | 12 | states = [(j,i) for i in range(GRID_HEIGHT) for j in range(GRID_WIDTH)] 13 | actions = ['left', 'right', 'up', 'down'] 14 | 15 | def windy_grid_world(state, action): 16 | x, y = state 17 | if state == GOAL_XY: 18 | return (state, 1), True 19 | 20 | reward = 0 21 | if action == 'left': 22 | x = x-1 23 | y = y + WIND_WEIGHT_X[max(x, 0)] 24 | if action == 'right': 25 | x = x+1 26 | y = y + WIND_WEIGHT_X[min(x, GRID_WIDTH-1)] 27 | if action == 'up': 28 | y = y + 1 + WIND_WEIGHT_X[x] 29 | if action == 'down': 30 | 
y = y - 1 + WIND_WEIGHT_X[x] 31 | 32 | if x < 0: 33 | x = 0 34 | reward -= 1 35 | if x >= GRID_WIDTH: 36 | x = GRID_WIDTH-1 37 | reward -= 1 38 | if y < 0: 39 | y = 0 40 | reward -= 1 41 | if y >= GRID_HEIGHT: 42 | y = GRID_HEIGHT-1 43 | reward -= 1 44 | 45 | return ((x, y), reward), False 46 | 47 | vqpi, samples = tdn(states, actions, windy_grid_world, (0,3), 'right', 48 | gamma=1, n=1, alpha=0.5, eps=0.1, n_episodes=175, max_steps=3000, optimize=True) 49 | 50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asttokens==2.2.1 2 | backcall==0.2.0 3 | certifi==2022.9.24 4 | charset-normalizer==2.1.1 5 | click==8.1.3 6 | click-default-group==1.2.2 7 | cloup==0.13.1 8 | colour==0.1.5 9 | comm==0.1.2 10 | commonmark==0.9.1 11 | contourpy==1.0.6 12 | cycler==0.11.0 13 | debugpy==1.6.6 14 | decorator==5.1.1 15 | executing==1.2.0 16 | fonttools==4.38.0 17 | glcontext==2.3.7 18 | idna==3.4 19 | importlib-metadata==6.0.0 20 | ipykernel==6.21.1 21 | ipython==8.9.0 22 | isosurfaces==0.1.0 23 | jedi==0.18.2 24 | jupyter_client==8.0.2 25 | jupyter_core==5.2.0 26 | kiwisolver==1.4.4 27 | mapbox-earcut==0.12.11 28 | matplotlib==3.6.1 29 | matplotlib-inline==0.1.6 30 | moderngl==5.7.0 31 | moderngl-window==2.4.2 32 | multipledispatch==0.6.0 33 | nest-asyncio==1.5.6 34 | networkx==2.8.8 35 | numpy==1.23.4 36 | packaging==21.3 37 | parso==0.8.3 38 | pexpect==4.8.0 39 | pickleshare==0.7.5 40 | Pillow==9.3.0 41 | platformdirs==2.6.2 42 | prompt-toolkit==3.0.36 43 | psutil==5.9.4 44 | ptyprocess==0.7.0 45 | pure-eval==0.2.2 46 | pydub==0.25.1 47 | pyglet==2.0.0 48 | Pygments==2.13.0 49 | pyparsing==3.0.9 50 | pyrr==0.10.3 51 | python-dateutil==2.8.2 52 | pyzmq==25.0.0 53 | requests==2.28.1 54 | rich==12.6.0 55 | rl==0.0.0 56 | scipy==1.9.3 57 | screeninfo==0.8.1 58 | six==1.16.0 59 | skia-pathops==0.7.3 60 | srt==3.5.2 61 | stack-data==0.6.2 62 | tornado==6.2 63 | tqdm==4.64.1 64 | traitlets==5.9.0 65 | typing_extensions==4.4.0 66 | urllib3==1.26.12 67 | watchdog==2.1.9 68 | wcwidth==0.2.6 69 | zipp==3.12.0 70 | -------------------------------------------------------------------------------- /rl/__init__.py: -------------------------------------------------------------------------------- 1 | from .model_free import ( 2 | ModelFree, 3 | ModelFreePolicy, 4 | EpsilonSoftPolicy 5 | ) 6 | from .solvers.model_based import ( 7 | vq_pi_iter_naive, 8 | value_iteration, 9 | policy_iteration 10 | ) 11 | from .solvers.model_free import ( 12 | alpha_mc, 13 | tdn, 14 | off_policy_mc, 15 | n_tree_backup 16 | ) 17 | from .solvers.planning import ( 18 | dynaq, 19 | priosweep, 20 | t_sampling, 21 | mcts, 22 | rtdp, 23 | ) 24 | from .solvers.approx import ( 25 | gradient_mc, 26 | semigrad_tdn, 27 | lstd, 28 | semigrad_td_lambda, 29 | diff_semigradn, 30 | reinforce_mc 31 | ) 32 | from .tiles import IHT, tiles 33 | 34 | from .utils import TransitionException 35 | 36 | __all__ = [ 37 | 'ModelFree', 38 | 'ModelFreePolicy', 39 | 'EpsilonSoftPolicy', 40 | 'TransitionException', 41 | 'vq_pi_iter_naive', 42 | 'value_iteration', 43 | 'policy_iteration', 44 | 'alpha_mc', 45 | 'tdn', 46 | 'off_policy_mc', 47 | 'n_tree_backup', 48 | 'dynaq', 49 | 'priosweep', 50 | 't_sampling', 51 | 'mcts', 52 | 'rtdp', 53 | 'gradient_mc', 54 | 'semigrad_tdn', 55 | 'lstd', 56 | 'semigrad_td_lambda', 57 | 'diff_semigradn', 58 | 'reinforce_mc', 59 | 'Tile', 60 | 'tiles' 61 | ] 
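The solvers re-exported above are what the example scripts import directly from the package root. A minimal sketch of that usage, reusing states, actions and random_walk exactly as defined in examples/random_walk.py:

from rl import alpha_mc

# first-visit Monte Carlo prediction, mirroring examples/random_walk.py
_, samples_mc = alpha_mc(states, actions, random_walk,
                         alpha=0.01, first_visit=True, n_episodes=200)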
-------------------------------------------------------------------------------- /rl/approximators.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | from time import perf_counter 4 | from abc import ABC, abstractmethod 5 | from typing import ( 6 | Optional, 7 | Callable, 8 | Tuple, 9 | Callable, 10 | Sequence, 11 | Union, 12 | List, 13 | Any 14 | ) 15 | 16 | import numpy as np 17 | 18 | from rl.utils import ( 19 | Policy, 20 | Transition, 21 | TransitionException, 22 | EpisodeStep, 23 | W_INIT, 24 | MAX_ITER, 25 | MAX_STEPS 26 | ) 27 | 28 | ''' 29 | All of this may change if the policy gradient methods are 30 | similar to this implementation. 31 | 32 | SGD and Semi Gradient Linear methods: 33 | 34 | All Linear methods of this methods involve using a real value 35 | weight matrix/vector that will be used in conjunction with a 36 | basis function to approximate the value function. 37 | 38 | wt+1 = wt - 1/2 * alpha * d[(v_pi - v_pi_hat)^2]/dw 39 | wt+1 = wt + alpha * (v_pi - v_pi_hat)]*d[v_pi_hat]/dw 40 | wt+1 = wt + alpha * (U - v_pi_hat)]*d[v_pi_hat]/dw 41 | 42 | Since we dont have v_pi we have to use some estimator: 43 | - MC would imply grabbing full trajectories and using them 44 | - TD since involves bootstraping (it will be a semigradient method). 45 | 46 | Therefore we generate the most abstract SGD method. The two most 47 | important parts of this methods is the U approximation to the real 48 | value, and the value function approximator, this class should be 49 | differentiable, or hold a gradient method. 50 | ''' 51 | 52 | 53 | class Approximator(ABC): 54 | '''Approximator base class that implements caster methods 55 | as well as defining the basic interface of any approximator. 56 | It has to be updateable and callable. Updatability implies 57 | that it can change its inner attributes and hopefully learn. 58 | ''' 59 | @abstractmethod 60 | def __call__(self, s: Any, *args, **kwargs) -> float: 61 | '''Return the value of the approximation''' 62 | raise NotImplementedError 63 | 64 | @abstractmethod 65 | def update(self, *args, **kwargs) -> Union[None, np.ndarray]: 66 | '''Update the approximator''' 67 | raise NotImplementedError 68 | 69 | def copy(self, *args, **kwargs) -> Any: 70 | '''Return a copy of the approximator''' 71 | return copy.deepcopy(self) 72 | 73 | def is_differentiable(self): 74 | grad = getattr(self, "grad", None) 75 | if grad: 76 | return True 77 | return False 78 | 79 | class ModelFreeTLPolicy(Policy): 80 | '''ModelFreeTLPolicy is for approximated methods what 81 | ModelFreePolicy is for tabular methods. 82 | 83 | This policies are thought with tabular actions in mind, since 84 | the problem of continuous action spaces are a topic of ongoing 85 | research and not yet standardized. For each a in the action-space A 86 | there will exist an approximator. 
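    In this implementation that role is played by a single q_hat defined over
    (state, action) pairs: __call__ below queries q_hat once per action and
    returns the action with the largest estimate.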
87 | ''' 88 | def __init__(self, actions: Sequence[Any], q_hat: Approximator): 89 | self.actions = actions 90 | self.A = len(actions) 91 | self.q_hat = q_hat 92 | 93 | def update_policy(self, *args, **kwargs): 94 | self.q_hat.update(*args, **kwargs) 95 | 96 | def __call__(self, state: Any): 97 | action_idx = np.argmax([self.q_hat((state, a)) for a in self.actions]) 98 | return self.actions[action_idx] 99 | 100 | 101 | class EpsSoftSALPolicy(ModelFreeTLPolicy): 102 | def __init__(self, actions: Sequence[Any], q_hat: Approximator, 103 | eps: float = 0.1): 104 | super().__init__(actions, q_hat) 105 | self.eps = eps 106 | 107 | def __call__(self, state): 108 | if np.random.rand() < self.eps: 109 | return np.random.choice(self.actions) 110 | return super().__call__(state) 111 | 112 | 113 | class REINFORCEPolicy(ModelFreeTLPolicy): 114 | def __init__(self, actions: Sequence[Any], pi_hat: Approximator): 115 | '''Must be a differential approximator''' 116 | self.actions = actions 117 | self.pi_hat = pi_hat 118 | if not self.pi_hat.is_differentiable(): 119 | raise TypeError("Policy approximator pi_hat must be differentiable") 120 | 121 | def grad_lnpi(self, s, a): 122 | pi_sa = self.pi_sa(s).reshape(-1, 1) 123 | grad_pi_sa = self.pi_hat.grad((s, a)).reshape(-1, 1) 124 | grads_pi_sa = np.array([self.pi_hat.grad((s, a_i)) for a_i in self.actions]) 125 | return (grad_pi_sa - grads_pi_sa @ pi_sa).reshape(-1) 126 | 127 | def update_policy(self, c: float, s: Any, a: Any): 128 | self.pi_hat.w += c*self.grad_lnpi(s, a) 129 | 130 | def pi_sa(self, s: Any) -> np.ndarray: 131 | pi_hat_sa = [self.pi_hat((s, a)) for a in self.actions] 132 | max_sa = max(pi_hat_sa) 133 | e_hsa = [np.exp(pi_hat_sa[i] - max_sa) for i in range(len(self.actions))] 134 | denom = sum(e_hsa) 135 | pi_sa = np.array([e_hsa[i]/denom for i in range(len(self.actions))]) 136 | return pi_sa 137 | 138 | def __call__(self, s: Any) -> float: 139 | '''default softmax implementation''' 140 | return np.random.choice(self.actions, p=self.pi_sa(s)) 141 | 142 | 143 | class ModelFreeTL: 144 | ''' 145 | ModelFreeTL stands for Model Free Tabular Less, even if we have state, 146 | to approximate methods what ModelFree is to tabular ones. 147 | 148 | ModelFreeTL is used mostly internally for the seek of readability 149 | on solvers, but can be used standalone as well. The usual case 150 | for this is when you want to generate arbitrary episodes for a 151 | specific environment. This class will stand in between of the 152 | user implemented transitions and the solvers. In difference with 153 | tabular ModelFree there is no room for validation previous to 154 | runtime executions. 
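    A hypothetical usage sketch (argument names illustrative):

        model = ModelFreeTL(transition, rand_state, policy, gamma=1)
        episode = model.generate_episode(s_0, a_0)  # list of (state, action, reward)

    which mirrors how the approximate solvers in this package drive their
    environments.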
155 | ''' 156 | 157 | def __init__(self, transition: Transition, rand_state: Callable, 158 | policy: ModelFreeTLPolicy, gamma: float = 1): 159 | self.policy = policy 160 | self.rand_state = rand_state 161 | self.transition = transition 162 | self.gamma = gamma 163 | self._validate_transition() 164 | 165 | def _validate_transition(self): 166 | start = perf_counter() 167 | while perf_counter() - start < 2: 168 | rand_s = self.rand_state() 169 | rand_a = np.random.choice(self.policy.actions) 170 | try: 171 | self.transition(rand_s, rand_a) 172 | except Exception as e: 173 | raise TransitionException( 174 | f'Transition function is not valid: {e}') 175 | 176 | def random_sa(self): 177 | a = np.random.choice(self.policy.actions) 178 | s = self.rand_state() 179 | return s, a 180 | 181 | def generate_episode(self, 182 | s_0: Any, 183 | a_0: Any, 184 | policy: ModelFreeTLPolicy=None, 185 | max_steps: int=MAX_STEPS) -> List[EpisodeStep]: 186 | '''Generate an episode using given policy if any, otherwise 187 | use the one defined as the attribute''' 188 | policy = policy if policy else self.policy 189 | episode = [] 190 | end = False 191 | step = 0 192 | s_t_1, a_t_1 = s_0, a_0 193 | while (end != True) and (step < max_steps): 194 | (s_t, r_t), end = self.transition(s_t_1, a_t_1) 195 | episode.append((s_t_1, a_t_1, r_t)) 196 | a_t = policy(s_t) 197 | s_t_1, a_t_1 = s_t, a_t 198 | step += 1 199 | 200 | return episode 201 | 202 | def step_transition(self, state: Any, action: Any 203 | ) -> Tuple[Tuple[Any, float], bool]: 204 | return self.transition(state, action) 205 | 206 | 207 | class SGDWA(Approximator): 208 | '''Stochastic Gradient Descent Weight-Vector Approximator 209 | for MSVE (mean square value error). 210 | 211 | Differentiable Value Function approximator dependent 212 | on a weight vector. Must define a gradient method. Thought 213 | to be less of a general case and more oriented toward the 214 | mean square value error VE, the prediction objective. 215 | ''' 216 | def __init__(self, 217 | fs:int=None, 218 | basis: Optional[Callable[[Any], np.ndarray]]=None): 219 | ''' 220 | Parameters 221 | ---------- 222 | fs: int 223 | feature shape, i.e. dimensionality of the function basis 224 | basis: Callable[[Any], np.ndarray], optional 225 | function basis defaults to identity. If not specified the 226 | signature must be Callable[[np.ndarray], np.ndarray] otherwise 227 | it will be probably fail miserably. 
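        For example (mirroring examples/mountain_car.py, where the
        LinearApproximator alias defined at the bottom of this module is used):

            v_hat = LinearApproximator(fs=1000, basis=state_aggregator)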
228 | ''' 229 | self.fs = fs 230 | self.basis_name = basis.__name__ 231 | self.basis = basis if basis else lambda x: x 232 | self.w = np.ones(self.fs)*W_INIT 233 | 234 | def grad(self, x: Any) -> np.ndarray: 235 | '''Return the gradient of the approximation''' 236 | return self.basis(x) 237 | 238 | def delta_w(self, U: float, alpha: float, x: Any, g: np.ndarray) -> np.ndarray: 239 | '''g: vector value, either gradient or elegibility trace''' 240 | return alpha * (U - self(x)) * g 241 | 242 | def et_update(self, U: float, alpha: float, x: Any, z: np.ndarray) -> np.ndarray: 243 | '''Updates inplace with elegibility traces the weight vector''' 244 | dw = self.delta_w(U, alpha, x, z) 245 | self.w = self.w + dw 246 | return dw 247 | 248 | def update(self, U: float, alpha: float, x: Any) -> np.ndarray: 249 | '''Updates inplace the weight vector and returns update just in case''' 250 | dw = self.delta_w(U, alpha, x, self.grad(x)) 251 | self.w = self.w + dw 252 | return dw 253 | 254 | def __call__(self, x): 255 | return np.dot(self.w, self.basis(x)) 256 | 257 | 258 | LinearApproximator = SGDWA 259 | -------------------------------------------------------------------------------- /rl/armed_bandits.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL - Copyright © 2023 Iván Belenky @Leculette 3 | """ 4 | 5 | from typing import List 6 | 7 | import numpy as np 8 | import numpy.random as rnd 9 | 10 | from rl.utils import Policy, RewardGenerator 11 | 12 | 13 | GAUSSIAN = [RewardGenerator('normal', rnd.random(), rnd.random()) for _ in range(10)] 14 | NGAMES = 1 15 | NSTEPS = 1000 16 | 17 | 18 | class EpsilonGreedyBanditPolicy(Policy): 19 | def __init__(self, k: int=10, epsilon: float=0.1, offset: float=0.0): 20 | self.k = k 21 | self.eps = epsilon 22 | self.offset = offset 23 | self.q_values = np.zeros(k) + self.offset 24 | self.N = np.zeros(k) 25 | 26 | def __call__(self) -> int: 27 | if rnd.random() < self.eps: 28 | return rnd.randint(self.k) 29 | 30 | return np.argmax(self.q_values) 31 | 32 | def update_policy(self, action: int, reward: float) -> None: 33 | N = self.N[action] + 1 34 | self.N[action] = N 35 | 36 | Q = self.q_values[action] 37 | R = reward 38 | Qnew = Q + 1/N*(R-Q) 39 | 40 | self.q_values[action] = Qnew 41 | 42 | 43 | class UCBPolicy(Policy): 44 | def __init__(self, k: int=10, c: float=2.0, offset: float=0.0): 45 | self.k = k 46 | self.c = c 47 | self.offset = offset 48 | self.q_values = np.zeros(k) + self.offset 49 | self.N = np.zeros(k) 50 | self.init_counter = 0 51 | 52 | def __call__(self): 53 | if self.init_counter < self.k: 54 | action_index = self.init_counter 55 | self.init_counter += 1 56 | return action_index 57 | 58 | return np.argmax( 59 | self.q_values + self.c*np.sqrt(np.log(np.sum(self.N))/self.N)) 60 | 61 | def update_policy(self, action, reward): 62 | N = self.N[action] + 1 63 | self.N[action] = N 64 | 65 | Q = self.q_values[action] 66 | R = reward 67 | Qnew = Q + 1/N*(R-Q) 68 | 69 | self.q_values[action] = Qnew 70 | 71 | 72 | class AlphaEpsilonGreedyBanditPolicy(EpsilonGreedyBanditPolicy): 73 | def __init__(self, k: int=10, epsilon: int=0.1, alpha: int=0.1): 74 | super().__init__(k, epsilon) 75 | self.alpha = alpha 76 | 77 | def update_policy(self, action, reward): 78 | Q = self.q_values[action] 79 | R = reward 80 | Qnew = Q + self.alpha*(R-Q) 81 | 82 | self.q_values[action] = Qnew 83 | 84 | 85 | class GradientPolicy(Policy): 86 | def __init__(self, k: int=10, alpha: float=0.1): 87 | self.k = k 88 | self.alpha = alpha 89 | 
self.rewards = [] 90 | self.H = np.zeros(k) 91 | self.Pr = np.zeros(k) 92 | 93 | def __call__(self) -> int: 94 | self.Pr = np.exp(self.H)/np.sum(np.exp(self.H)) 95 | return np.random.choice(self.k, p=self.Pr) 96 | 97 | def update_policy(self, action, reward) -> None: 98 | self.H -= self.alpha*(reward - np.mean(self.rewards))*self.Pr 99 | self.H[action] += self.alpha*(reward - np.mean(self.rewards)) 100 | self.rewards.append(reward) 101 | 102 | 103 | EGREEDY = EpsilonGreedyBanditPolicy() 104 | 105 | 106 | class MultiArmedBandit: 107 | def __init__( 108 | self, 109 | k: int=10, 110 | reward_generators: List[RewardGenerator]=GAUSSIAN, 111 | n_games: int=NGAMES, 112 | policy: Policy=EGREEDY): 113 | 114 | self.k = k 115 | self.reward_generators = reward_generators 116 | self.N = n_games 117 | self.histories = [] 118 | self.reward_history = [] 119 | self.action_history = [] 120 | self.policy = policy 121 | self.ground_truth = np.argmax([ 122 | rg.mean() for rg in self.reward_generators]) 123 | 124 | def step(self, action: int) -> float: 125 | reward = self.reward_generators[action].generate() 126 | self.reward_history.append(reward) 127 | self.action_history.append(action) 128 | 129 | return reward 130 | 131 | def reset(self) -> None: 132 | self.action_history = [] 133 | self.reward_history = [] 134 | 135 | def evaluate_policy(self) -> List[float]: 136 | for _ in range(self.N): 137 | self.step(self.policy()) 138 | 139 | return self.reward_history 140 | 141 | def update_policy(self) -> None: 142 | for _ in range(self.N): 143 | action = self.policy() 144 | reward = self.step(action) 145 | self.policy.update_policy(action, reward) 146 | 147 | def best_action_percentage(self) -> None: 148 | ah = np.array(self.action_history) 149 | n = ah[ah==self.ground_truth] 150 | return n.shape[0]/ah.shape[0] -------------------------------------------------------------------------------- /rl/mdp.py: -------------------------------------------------------------------------------- 1 | '''RL Copyright © 2023 Iván Belenky''' 2 | 3 | from typing import Tuple, List 4 | from abc import ABC, abstractmethod 5 | 6 | import numpy as np 7 | 8 | from rl.utils import Policy, RewardGenerator 9 | from rl.solvers.model_based import ( 10 | vq_pi_iter_naive, 11 | policy_iteration, 12 | value_iteration 13 | ) 14 | 15 | PROB_TOL = 1E-3 16 | ESTIMATE_ITERS = int(1E3) 17 | 18 | 19 | class MarkovReward(ABC): 20 | @abstractmethod 21 | def generate(self, state: int, action: int) -> float: 22 | raise NotImplementedError 23 | 24 | @abstractmethod 25 | def r_sas(self, next_state: int) -> float: 26 | ''' 27 | r(s,a,s') = E[Rt|St-1 = s, At-1 = a, St = s'] 28 | ''' 29 | 30 | raise NotImplementedError 31 | 32 | def r_sa(self, p_s: np.ndarray, state: int, action: int): 33 | ''' 34 | r(s,a) = E[Rt|St-1 = s, At-1 = a] 35 | ''' 36 | p = p_s[state][action] 37 | r = 0 38 | for i,ps in enumerate(p): 39 | r += ps*self.mean(state=self.states[i]) 40 | return r 41 | 42 | 43 | class TabularReward(MarkovReward): 44 | ''' 45 | Tabular reward implements as the name suggests a reward 46 | per state and action. The reward is a matrix of size SxA. 47 | This type of reward is used in the case that the world 48 | in which the agent conducts gives you fixed rewards for 49 | taking action: a at state: s. 
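    The gridworld example builds exactly such an (S, A) array and passes it as
    reward_gen=TabularReward(r_sa) when constructing the MDP.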
50 | ''' 51 | 52 | def __init__( 53 | self, 54 | r_sa: np.ndarray, 55 | ): 56 | self.states, self.actions = r_sa.shape 57 | self._r_sa = r_sa 58 | 59 | def generate(self, state: int = 0, action: int = 0) -> float: 60 | return self._r_sa[state][action] 61 | 62 | def r_sa(self, p_s: np.ndarray, state: int, action: int): 63 | return self._r_sa[state][action] 64 | 65 | def r_sas(self, next_state: int) -> float: 66 | return np.mean(self._r_sa[next_state]) 67 | 68 | 69 | 70 | class MarkovPolicy(Policy): 71 | ''' 72 | Markov Policy is a policy that is defined by a matrix of size SxA. 73 | This class admits a policy defined by the user or a equally probable 74 | policy will be created. 75 | 76 | The policy matrix π(a|s) must be a matrix of size SxA where each row 77 | represents the probability of taking action a at state s. Therefore 78 | each row must sum to 1 within the specified tolerance 1E-3. 79 | ''' 80 | 81 | def __init__(self, pi_sa: np.ndarray = None, s: int = None, a:int = None): 82 | ''' 83 | pi_sa: policy matrix 84 | s: number of states 85 | a: number of actions 86 | 87 | pi_sa and s and a are mutually exclusive. If pi_sa is provided then 88 | s and a are ignored. If pi_sa is not provided then s and a must be 89 | provided. 90 | ''' 91 | if not pi_sa and not (s or a): 92 | raise ValueError("Either pi_sa or s and a must be provided") 93 | 94 | if pi_sa: 95 | self.pi_sa = pi_sa 96 | self.s, self.a = self.pi_sa.shape 97 | self._validate_attr() 98 | else: 99 | self.s = s 100 | self.a = a 101 | #equal probable policy 102 | self.pi_sa = np.ones((self.s, self.a))/self.a 103 | 104 | def _validate_attr(self): 105 | if not np.allclose(self.pi_sa.sum(axis=1), 1, atol=PROB_TOL): 106 | raise ValueError("Each row must sum to 1") 107 | 108 | def update_policy(self, q_pi: np.ndarray): 109 | ''' 110 | Updates the policy based on the Q function: for each state s 111 | the action a that maximizes Q(s,a) is selected. If there are 112 | multiple actions that maximize Q(s,a) then the policy is 113 | updated to be equally probable among those actions. 114 | ''' 115 | self.pi_sa = np.array([self._update_policy(q_pi, s) 116 | for s in range(self.s)]) 117 | 118 | def _update_policy(self, q_pi: np.ndarray, state: int) -> np.ndarray: 119 | q_sa = q_pi.T[state] 120 | max_q = max(q_sa) 121 | max_q_sa = np.array([q_sa[a] == max_q for a in range(self.a)]) 122 | return max_q_sa / sum(max_q_sa) 123 | 124 | def π(self, state: int): 125 | ''' 126 | π(a|s=state) 127 | ''' 128 | return self.pi_sa[state] 129 | 130 | def __call__(self, state: int) -> np.ndarray: 131 | ''' 132 | Collapses the policy to a single action, i.e. a sample from the 133 | random variable that represents the policy. 
134 | ''' 135 | return np.random.choice(self.pi_sa[state], p=self.pi_sa[state]) 136 | 137 | 138 | class MDP: 139 | VQ_PI_SOLVERS = { 140 | 'iter_n': vq_pi_iter_naive 141 | } 142 | 143 | OPTIMAL_POLICY_SOLVERS = { 144 | 'policy_iteration' : policy_iteration, 145 | 'value_iteration' : value_iteration, 146 | } 147 | 148 | def __init__( 149 | self, 150 | p_s: np.ndarray, 151 | states: np.ndarray, 152 | actions: np.ndarray, 153 | gamma: float = 0.9, 154 | policy: Policy = None, 155 | reward_gen: RewardGenerator = None, 156 | ): 157 | self.p_s = p_s 158 | self.states = states 159 | self.actions = actions 160 | self.gamma = gamma 161 | self.reward_gen = reward_gen 162 | self.history = [] 163 | self._validate_attr() 164 | 165 | self.S = self.states.shape[0] 166 | self.A = self.actions.shape[0] 167 | self.policy = policy if policy else MarkovPolicy(s=self.S, a=self.A) 168 | 169 | @property 170 | def cum_return(self) -> float: 171 | return np.sum([r for _, r in self.history]) 172 | 173 | @property 174 | def discounted_return(self) -> float: 175 | return np.sum( 176 | [r*(self.gamma**i) for i,(_, r) in enumerate(self.history)]) 177 | 178 | def _validate_attr(self): 179 | S = self.states.shape[0] 180 | A = self.actions.shape[0] 181 | if self.p_s.shape != (S, A, S): 182 | raise ValueError( 183 | "p_s must be of shape " + 184 | f"(n_states, n_actions, n_states) = ({S}, {A}, {S})") 185 | 186 | for i in range(S): 187 | if not np.allclose(self.p_s[i].sum(axis=1), 1, atol=PROB_TOL): 188 | raise ValueError("Each row must sum to 1") 189 | 190 | if self.gamma > 1 or self.gamma < 0: 191 | raise ValueError( 192 | f"discounted rate gamma has to be in range [0, 1]") 193 | 194 | def r_sa(self, state: int, action: int) -> float: 195 | return self.reward_gen.r_sa(self.p_s, state, action) 196 | 197 | def r_sas(self, next_s: int) -> float: 198 | return self.reward_gen.r_sas(next_s) 199 | 200 | def pi_sa(self, state: int) -> np.ndarray: 201 | return self.policy.pi_sa(state) 202 | 203 | def vq_pi( 204 | self, 205 | policy: MarkovPolicy = None, 206 | method: str = 'iter_n' 207 | ) -> np.ndarray: 208 | ''' 209 | Individual state value functions and action-value functions 210 | vpi and qpi cannot be calculated for bigger problems. That 211 | constraint will give rise to parametrizations via DL. 212 | ''' 213 | policy = policy if policy else self.policy 214 | solver = self.VQ_PI_SOLVERS.get(method) 215 | if not solver: 216 | raise ValueError(f"Method {method} does not exist") 217 | 218 | return solver(self, policy) 219 | 220 | def optimize_policy( 221 | self, 222 | method: str = 'policy_iteration', 223 | policy: MarkovPolicy = None 224 | ) -> MarkovPolicy: 225 | ''' 226 | Optimal policy is the policy that maximizes the expected 227 | discounted return. It is the policy that maximizes the 228 | value function for each possible state. 
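        Available methods are the keys of OPTIMAL_POLICY_SOLVERS:
        'policy_iteration' (default) and 'value_iteration'.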
229 | ''' 230 | policy = policy if policy else self.policy 231 | solver = self.OPTIMAL_POLICY_SOLVERS.get(method) 232 | if not solver: 233 | raise ValueError(f"Method {method} does not exist") 234 | 235 | solver(self, policy) 236 | 237 | 238 | def __call__(self, state: int = 0) -> Tuple[int, float]: 239 | p = self.p_s[state][self.policy(state)] 240 | next_state = np.random.choice(self.states, p=p) 241 | self.curr_state = next_state 242 | reward = self.reward_gen.generate(next_state) 243 | 244 | self.history.append((self.curr_state, reward)) 245 | 246 | return next_state, reward -------------------------------------------------------------------------------- /rl/model_free.py: -------------------------------------------------------------------------------- 1 | ''' 2 | RL - Copyright © 2023 Iván Belenky @Leculette 3 | ''' 4 | 5 | from typing import ( 6 | Tuple, 7 | Union, 8 | Sequence, 9 | Callable, 10 | List, 11 | Any, 12 | NewType, 13 | ) 14 | 15 | import numpy as np 16 | 17 | from rl.utils import ( 18 | Policy, 19 | State, 20 | Action, 21 | StateAction, 22 | TransitionException, 23 | EpisodeStep, 24 | MAX_ITER, 25 | MAX_STEPS 26 | ) 27 | 28 | class ModelFreePolicy(Policy): 29 | def __init__(self, A: Union[Sequence[Any], int], S: Union[Sequence[Any], int]): 30 | if not isinstance(A, int): 31 | A = len(A) 32 | if not isinstance(S, int): 33 | S = len(S) 34 | self.A = A 35 | self.S = S 36 | self.pi = np.ones((S, A))/A 37 | 38 | def __call__(self, state: int): 39 | return np.random.choice(self.A, p=self.pi[state]) 40 | 41 | def pi_as(self, action: int, state: int): 42 | return self.pi[state, action] 43 | 44 | def update_policy(self, q, s): 45 | qs_mask = (q[s] == np.max(q[s])) 46 | self.pi[s] = np.where(qs_mask, 1.0/qs_mask.sum(), 0) 47 | 48 | def _make_deterministic(self): 49 | self.pi = np.eye(self.A)[np.argmax(self.pi, axis=1)] 50 | 51 | 52 | class EpsilonSoftPolicy(ModelFreePolicy): 53 | def __init__(self, A, S, eps): 54 | super().__init__(A, S) 55 | self.Ɛ = eps 56 | 57 | def update_policy(self, q, s): 58 | # if there are multiple actions with the same value, 59 | # then we choose one of them randomly 60 | max_q = np.max(q[s]) 61 | qs_mask = (q[s] == max_q) 62 | self.pi[s] = self.Ɛ/self.A 63 | self.pi[s, qs_mask] += (1 - self.Ɛ)/qs_mask.sum() 64 | 65 | 66 | class ModelFree: 67 | ''' 68 | ModelFree is the base holder of the states, actions, and 69 | the transition defining an environment. 70 | 71 | ModelFree is used mostly internally for the seek of readability 72 | on solvers, but can be used standalone as well. The usual case 73 | for this is when you want to generate arbitrary episodes of a 74 | specific environment. This class will stand in between of the 75 | user implemented transitions and validate its correct behavior. 
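    The user supplied transition must follow the signature used throughout the
    examples,

        (next_state, reward), done = transition(state, action)

    and every state/action it returns must be declared in states/actions;
    _validate_transition below checks exactly that at construction time.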
76 | ''' 77 | 78 | def __init__(self, states: Sequence[Any], actions: Sequence[Any], 79 | transition: Callable, gamma: float = 1, policy: ModelFreePolicy = None 80 | ): 81 | 82 | self.policy = policy 83 | self.states = State(states) 84 | self.actions = Action(actions) 85 | self.stateaction = StateAction( 86 | [(s,a) for s,a in zip(states, actions)]) 87 | self.transition = transition 88 | self.gamma = gamma 89 | self.policy = policy if policy else ModelFreePolicy( 90 | self.actions.N, self.states.N) 91 | 92 | self._validate_transition() 93 | 94 | def init_vq(self): 95 | v = np.zeros(self.states.N) 96 | q = np.zeros((self.states.N, self.actions.N)) 97 | return v,q 98 | 99 | def random_sa(self, value=False): 100 | s = self.states.random(value) 101 | a = self.actions.random(value) 102 | return s, a 103 | 104 | def _to_index(self, state, action): 105 | state = self.states.get_index(state) 106 | action = self.actions.get_index(action) 107 | 108 | return state, action 109 | 110 | def _validate_transition(self): 111 | states = self.states.seq 112 | actions = self.actions.seq 113 | sa = [(s,a) for s in states for a in actions] 114 | 115 | success, fail_count = True, 0 116 | for s, a in sa: 117 | try: 118 | self.__validate_transition(s, a) 119 | except Exception as e: 120 | success = False 121 | fail_count += 1 122 | print(f"Warning: {e}") # TODO: change to logger 123 | 124 | if not success: 125 | raise TransitionException( 126 | f"Transition failed for {fail_count} state-action pairs") 127 | 128 | def __validate_transition(self, state: Any, action: Any, 129 | ) -> Tuple[Tuple[Any, Union[float, int]], bool]: 130 | 131 | try: 132 | (s, r), end = self.transition(state, action) 133 | except Exception as e: 134 | raise TransitionException(f"Transition method failed: {e}") 135 | 136 | if not isinstance(end, bool) or not isinstance(r, (float, int)): 137 | raise TransitionException( 138 | "Transition method must return (Any, float), bool" 139 | f" instead of ({type(s)}, {type(r)}), {type(end)}" 140 | ) 141 | try: 142 | self.states.get_index(s) 143 | self.states.get_index(state) 144 | self.actions.get_index(action) 145 | except Exception as e: 146 | raise TransitionException( 147 | f"Undeclared state or action in transition method: {e}") 148 | 149 | return (s, r), end 150 | 151 | def generate_episode(self, s_0: Any, a_0: Any, policy: ModelFreePolicy = None, 152 | max_steps: int=MAX_STEPS) -> List[EpisodeStep]: 153 | 154 | policy = policy if policy else self.policy 155 | 156 | episode = [] 157 | end = False 158 | step = 0 159 | s_t_1, a_t_1 = s_0, a_0 160 | while (end != True) and (step < max_steps): 161 | (s_t, r_t), end = self.transition(s_t_1, a_t_1) 162 | (_s, _a), _r = self._to_index(s_t_1, a_t_1), r_t 163 | episode.append((_s, _a, _r)) 164 | a_t = policy(self.states.get_index(s_t)) 165 | s_t_1, a_t_1 = s_t, self.actions.from_index(a_t) 166 | 167 | step += 1 168 | 169 | return episode 170 | 171 | def step_transition(self, state: int, action: int 172 | ) -> Tuple[Tuple[int, float], bool]: 173 | 174 | s, a = self.states.from_index(state), self.actions.from_index(action) 175 | (s_t, r_t), end = self.transition(s, a) 176 | s_new = self.states.get_index(s_t) 177 | return (s_new, r_t), end -------------------------------------------------------------------------------- /rl/solvers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/rl/solvers/__init__.py 
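For reference, the tabular episodes produced by ModelFree.generate_episode are consumed like this in the planning example earlier (a sketch reusing names from examples/dyna_maze.py):

model = ModelFree(states, actions, obstacle_maze, gamma=0.95)
init_state = model.states.get_index(START_XY)
a0 = final_policy(init_state)  # index of the action the learned policy picks at the start
episode = model.generate_episode(START_XY, actions[a0], policy=final_policy)
# episode is a list of (state_index, action_index, reward) triples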
-------------------------------------------------------------------------------- /rl/solvers/approx.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractclassmethod 2 | from copy import deepcopy 3 | from typing import ( 4 | Sequence, 5 | Callable, 6 | Tuple, 7 | Optional, 8 | List, 9 | Any, 10 | NewType 11 | ) 12 | 13 | import numpy as np 14 | from numpy.linalg import norm as lnorm 15 | from tqdm import tqdm 16 | 17 | from rl.approximators import ( 18 | Approximator, 19 | SGDWA, 20 | ModelFreeTL, 21 | ModelFreeTLPolicy, 22 | EpsSoftSALPolicy, 23 | REINFORCEPolicy 24 | ) 25 | from rl.utils import ( 26 | _typecheck_all, 27 | _get_sample_step, 28 | _check_ranges, 29 | Samples, 30 | Transition, 31 | MAX_ITER, 32 | MAX_STEPS, 33 | TOL 34 | ) 35 | 36 | class AVQPi: 37 | def __init__(self, v: Approximator, q: Approximator, pi: ModelFreeTLPolicy): 38 | self.v_hat = v 39 | self.q = q 40 | self.pi = pi 41 | 42 | 43 | def get_sample(v_hat, q_hat, π, n_episode, optimize): 44 | _idx = n_episode 45 | _v = v_hat.copy() 46 | _q = None 47 | _pi = None 48 | if optimize: 49 | _pi = deepcopy(π) 50 | _q = q_hat.copy() 51 | return (_idx, _v, _q, _pi) 52 | 53 | 54 | def _set_s0_a0(MFS, s, a): 55 | s_0, a_0 = MFS.random_sa() 56 | s_0 = s_0 if not s else s 57 | a_0 = a_0 if not a else a 58 | return s_0, a_0 59 | 60 | 61 | def onehot_q_hat(v_hat, actions): 62 | '''V(s) function approximator to Q(s,a) function approximator''' 63 | A = len(actions) 64 | onehot_actions = {a:np.zeros(A-1) for a in actions} 65 | for a in range(A-1): 66 | onehot_actions[a][a] = 1 67 | 68 | def new_basis(sa): 69 | s, a = sa 70 | b_s = v_hat.basis(s) 71 | a = onehot_actions[a] 72 | b_sa = np.append(b_s, a) 73 | return b_sa 74 | 75 | fs = v_hat.fs + A - 1 76 | basis = new_basis 77 | 78 | q_hat = v_hat.__class__(fs, basis) 79 | return q_hat 80 | 81 | 82 | def _set_policy(policy, eps, actions, v_hat, q_hat): 83 | if not policy: 84 | if not q_hat: 85 | q_hat = onehot_q_hat(v_hat, actions) 86 | if eps: 87 | _typecheck_all(constants=[eps]) 88 | _check_ranges(values=[eps], ranges=[(0,1)]) 89 | policy = EpsSoftSALPolicy(actions, q_hat, eps=eps) 90 | else: 91 | policy = ModelFreeTLPolicy(actions, q_hat) 92 | return policy 93 | 94 | 95 | def gradient_mc(transition: Transition, 96 | random_state: Callable[[Any], Any], 97 | actions: Sequence[Any], 98 | v_hat: SGDWA, 99 | q_hat: SGDWA=None, 100 | state_0: Any=None, 101 | action_0: Any=None, 102 | alpha: float=0.05, 103 | gamma: float=1.0, 104 | n_episodes: int=MAX_ITER, 105 | max_steps: int=MAX_STEPS, 106 | samples: int=1000, 107 | optimize: bool=False, 108 | policy: ModelFreeTLPolicy=None, 109 | tol: float=TOL, 110 | eps: float=None) -> Tuple[AVQPi, Samples]: 111 | '''Gradient α-MC algorithm for estimating, and optimizing policies 112 | 113 | gradient_mc uses the gradient of VE to estimate the value of 114 | a state given a policy. The work behind estimation runs is to 115 | the training process of the value function approximator with MC 116 | estimates. It can also optimize the policies themselves. 117 | 118 | Parameters 119 | ---------- 120 | transition : Callable[[Any,Any],[[Any,float], bool]]] 121 | transition must be a callable function that takes as arguments the 122 | (state, action) and returns (new_state, reward), end. 
123 |     random_state : Callable[[Any], Any]
124 |         random state generator
125 |     actions : Sequence[Any]
126 |         Sequence of possible actions
127 |     v_hat : SGDWA
128 |         Function approximator to use for the state value function
129 |     q_hat: SGDWA, optional
130 |         Function approximator to use for the action-value function, by default None
131 |         and will be replaced by a mocked version of q_hat where a one hot
132 |         encoding for the actions is going to get appended to the state vector.
133 |     state_0 : Any, optional
134 |         Initial state, by default None (random)
135 |     action_0 : Any, optional
136 |         Initial action, by default None (random)
137 |     alpha : float, optional
138 |         Learning rate, by default 0.05
139 |     gamma : float, optional
140 |         Discount factor, by default 1.0
141 |     n_episodes : int, optional
142 |         Number of episodes to simulate, by default 1E4
143 |     max_steps : int, optional
144 |         Maximum number of steps per episode, by default 1E3
145 |     samples : int, optional
146 |         Number of samples to take, by default 1000
147 |     optimize : bool, optional
148 |         Whether to optimize the policy or not, by default False
149 |     policy : ModelFreeTLPolicy, optional
150 |         Policy to use, by default a greedy ModelFreeTLPolicy built from q_hat (epsilon-soft if eps is given)
151 |     tol : float, optional
152 |         Tolerance on the change of the weight vector used to declare convergence
153 |     eps : float, optional
154 |         Epsilon value for the epsilon-soft policy, by default None (no exploration)
155 | 
156 |     Returns
157 |     -------
158 |     vqpi : AVQPi
159 |         Value function, action-value function and policy approximators.
160 |     samples : List[Tuple[int, SGDWA, Optional[SGDWA], Optional[ModelFreeTLPolicy]]]
161 |         Samples taken during the simulation if any. The first element is the
162 |         index of the episode, the second is the value approximator, the third is
163 |         the action-value approximator and the fourth is the policy.
164 | 
165 |     Raises
166 |     ------
167 |     TransitionException: If any of the arguments is not of the correct type.
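    Example
    -------
    As called in examples/mountain_car.py, with the tile-coding approximators
    built there (a sketch rather than a doctest):

        vqpi, samples = gradient_mc(mountain_car, state_generator, ACTIONS,
            vhat, q_hat=qhat, state_0=(0, 0), action_0=0, n_episodes=500,
            max_steps=1E4, alpha=0.1/8, eps=0.1, optimize=True)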
168 | ''' 169 | 170 | policy = _set_policy(policy, eps, actions, v_hat, q_hat) 171 | 172 | _typecheck_all(transition=transition, 173 | constants=[gamma, alpha, n_episodes, max_steps, samples, tol], 174 | booleans=[optimize], policies=[policy]) 175 | 176 | _check_ranges(values=[gamma, alpha, n_episodes, max_steps, samples], 177 | ranges=[(0,1), (0,1), (1,np.inf), (1,np.inf), (1,1001)]) 178 | 179 | sample_step = _get_sample_step(samples, n_episodes) 180 | 181 | model = ModelFreeTL(transition, random_state, policy, gamma=gamma) 182 | vh, qh, samples = _gradient_mc(model, v_hat, state_0, action_0, 183 | alpha, int(n_episodes), int(max_steps), tol, optimize, sample_step) 184 | 185 | return AVQPi(vh, qh, policy), samples 186 | 187 | 188 | def _gradient_mc(MFS, v_hat, s_0, a_0, alpha, n_episodes, 189 | max_steps, tol, optimize, sample_step): 190 | 191 | α, γ, π = alpha, MFS.gamma, MFS.policy 192 | q_hat = π.q_hat 193 | 194 | samples, dnorm = [], TOL*2 195 | for n_episode in tqdm(range(n_episodes), desc=f'grad-MC', unit='episodes'): 196 | if dnorm < tol: 197 | break 198 | s_0, a_0 = _set_s0_a0(MFS, s_0, a_0) 199 | 200 | episode = MFS.generate_episode(s_0, a_0, π, max_steps) 201 | w_old = v_hat.w.copy() 202 | 203 | G = 0 204 | for s_t, a_t, r_tt in episode[::-1]: 205 | G = γ*G + r_tt 206 | v_hat.update(G, α, s_t) 207 | 208 | if optimize: 209 | q_hat.update(G, α, (s_t, a_t)) 210 | 211 | dnorm = lnorm(w_old - v_hat.w) 212 | 213 | if sample_step and n_episode % sample_step == 0: 214 | samples.append(get_sample(v_hat, q_hat, π, n_episode, optimize)) 215 | 216 | return v_hat, q_hat, samples 217 | 218 | 219 | def semigrad_tdn(transition: Transition, 220 | random_state: Callable[[Any], Any], 221 | actions: Sequence[Any], 222 | v_hat: SGDWA, 223 | q_hat: SGDWA=None, 224 | state_0: Any=None, 225 | action_0: Any=None, 226 | alpha: float=0.05, 227 | n: int=1, 228 | gamma: float=1.0, 229 | n_episodes: int=MAX_ITER, 230 | max_steps: int=MAX_STEPS, 231 | samples: int=1000, 232 | optimize: bool=False, 233 | policy: ModelFreeTLPolicy=None, 234 | tol: float=TOL, 235 | eps: float=None) -> Tuple[AVQPi, Samples]: 236 | '''Semi-Gradient n-step Temporal Difference 237 | 238 | Solver for the n-step temporal difference algorithm. The algorithm is 239 | semi-gradient in the sense that it uses a function approximator to 240 | estimate the _true_ value function. If optimize is set, since no 241 | encoding of the action into the feature basis is done, the algorithm 242 | will optimize the policy making one approximator per action. Naive, 243 | and cost-innefective 244 | 245 | Parameters 246 | ---------- 247 | transition : Callable[[Any,Any],[[Any,float], bool]]] 248 | transition must be a callable function that takes as arguments the 249 | (state, action) and returns (new_state, reward), end. 250 | random_state : Callable[[Any], Any] 251 | random state generator 252 | v_hat : SGDWA 253 | Function approximator to use for the state value function 254 | q_hat: SGDWA, optional 255 | Function approximator to use for the action-value function, by default None 256 | and will be replaced by a mocked version of q_hat where a one hot 257 | encoding is going to get appended to the state vector. 
258 | actions: Sequence[Any] 259 | Sequence of possible actions 260 | state_0 : Any, optional 261 | Initial state, by default None (random) 262 | action_0 : Any, optional 263 | Initial action, by default None (random) 264 | alpha : float, optional 265 | Learning rate, by default 0.1 266 | n : int, optional 267 | Number of steps to look ahead, by default 1 268 | gamma : float, optional 269 | Discount factor, by default 0.9 270 | n_episodes : int, optional 271 | Number of episodes to simulate, by default 1E4 272 | max_steps : int, optional 273 | Maximum number of steps per episode, by default 1E3 274 | samples : int, optional 275 | Number of samples to take, by default 1000 276 | optimize : bool, optional 277 | Whether to optimize the policy or not, by default False 278 | policy : ModelFreePolicy, optional 279 | Policy to use, by default equal probability ModelFreePolicy 280 | tol : float, optional 281 | Tolerance for estimating convergence estimations 282 | eps : float, optional 283 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 284 | 285 | Returns 286 | ------- 287 | vqpi : Tuple[VPi, QPi, Policy] 288 | Value function, action-value function, policy and samples if any. 289 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 290 | Samples taken during the simulation if any. The first element is the 291 | index of the iteration, the second is the value function, the third is 292 | the action-value function and the fourth is the TODO:. 293 | 294 | Raises 295 | ------ 296 | TransitionError: If any of the arguments is not of the correct type. 297 | ''' 298 | policy = _set_policy(policy, eps, actions, v_hat, q_hat) 299 | 300 | _typecheck_all(transition=transition, 301 | constants=[gamma, alpha, n_episodes, max_steps, samples, tol, n], 302 | booleans=[optimize], policies=[policy]) 303 | 304 | _check_ranges(values=[gamma, alpha, n_episodes, max_steps, samples, n], 305 | ranges=[(0,1), (0,1), (1,np.inf), (1,np.inf), (1,1001), (1, np.inf)]) 306 | 307 | sample_step = _get_sample_step(samples, n_episodes) 308 | 309 | model = ModelFreeTL(transition, random_state, policy, gamma=gamma) 310 | v, q, samples = _semigrad_tdn(model, v_hat, state_0, action_0, 311 | alpha, n, int(n_episodes), int(max_steps), tol, optimize, sample_step) 312 | 313 | return AVQPi(v, q, policy), samples 314 | 315 | 316 | def _semigrad_tdn(MFS, v_hat, s_0, a_0, alpha, n, n_episodes, max_steps, 317 | tol, optimize, sample_step): 318 | '''Semi gradient n-step temporal difference 319 | 320 | DRY but clear. 
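    The update target is the n-step return

        G = R_{t+1} + γ*R_{t+2} + ... + γ^{n-1}*R_{t+n} + γ^n * v_hat(S_{t+n})

    (with the analogous q_hat bootstrap when optimizing), computed below from
    the precomputed gammatron vector of discount powers.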
321 | ''' 322 | 323 | α, γ, π = alpha, MFS.gamma, MFS.policy 324 | gammatron = np.array([γ**i for i in range(n)]) 325 | q_hat = π.q_hat 326 | 327 | samples, dnorm = [], TOL*2 328 | for n_episode in tqdm(range(n_episodes), desc=f'semigrad-TD', unit='episodes'): 329 | if dnorm < tol: 330 | break 331 | s, a = _set_s0_a0(MFS, s_0, a_0) 332 | 333 | w_old = v_hat.w.copy() 334 | 335 | T = int(max_steps) 336 | R, A, S, G = [], [a], [s], 0 337 | for t in range(T): 338 | if t < T: 339 | (s, r), end = MFS.step_transition(s, a) 340 | R.append(r) 341 | S.append(s) 342 | if end: 343 | T = t + 1 344 | else: 345 | a = π(s) 346 | A.append(a) 347 | 348 | tau = t - n + 1 349 | if tau >= 0: 350 | rr = np.array(R[tau:min(tau+n, T)]) 351 | G = gammatron[:rr.shape[0]].dot(rr) 352 | G_v, G_q = G, G 353 | if tau + n < T: 354 | G_v = G_v + γ**n * v_hat(S[tau+n]) 355 | G_q = G_q + γ**n * q_hat((S[tau+n], A[tau+n])) 356 | 357 | s_t = S[tau] 358 | a_t = A[tau] 359 | 360 | v_hat.update(G_v, α, s_t) 361 | 362 | if optimize: 363 | q_hat.update(G_q, α, (s_t, a_t)) 364 | 365 | if tau == T - 1: 366 | break 367 | 368 | dnorm = lnorm(w_old - v_hat.w) 369 | 370 | if n_episode % sample_step == 0: 371 | samples.append(get_sample(v_hat, q_hat, π, n_episode, optimize)) 372 | n_episode += 1 373 | 374 | return v_hat, q_hat, samples 375 | 376 | 377 | # TODO: policy setting and optimize 378 | def lstd(transition: Transition, 379 | random_state: Callable[[Any], Any], 380 | state_0: Any=None, 381 | action_0: Any=None, 382 | alpha: float=0.05, 383 | gamma: float=1.0, 384 | n_episodes: int=MAX_ITER, 385 | max_steps: int=MAX_STEPS, 386 | samples: int=1000, 387 | optimize: bool=False, 388 | policy: ModelFreeTLPolicy=None, 389 | tol: float=TOL, eps: float=None) -> Tuple[AVQPi, Samples]: 390 | '''Least squares n-step temporal differnece 391 | 392 | Parameters 393 | ---------- 394 | transition : Callable[[Any,Any],[[Any,float], bool]]] 395 | transition must be a callable function that takes as arguments the 396 | (state, action) and returns (new_state, reward), end. 397 | random_state: Callable[[Any], Any] 398 | random state generator 399 | actions : Sequence[Any] 400 | Sequence of possible actions 401 | state_0 : Any, optional 402 | Initial state, by default None (random) 403 | action_0 : Any, optional 404 | Initial action, by default None (random) 405 | alpha : float, optional 406 | Learning rate, by default 0.1 407 | gamma : float, optional 408 | Discount factor, by default 0.9 409 | n_episodes : int, optional 410 | Number of episodes to simulate, by default 1E4 411 | max_steps : int, optional 412 | Maximum number of steps per episode, by default 1E3 413 | samples : int, optional 414 | Number of samples to take, by default 1000 415 | optimize : bool, optional 416 | Whether to optimize the policy or not, by default False 417 | policy : ModelFreePolicy, optional 418 | Policy to use, by default equal probability ModelFreePolicy 419 | tol : float, optional 420 | Tolerance for estimating convergence estimations 421 | eps : float, optional 422 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 423 | 424 | Returns 425 | ------- 426 | vqpi : Tuple[VPi, QPi, Policy] 427 | Value function, action-value function, policy and samples if any. 428 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 429 | Samples taken during the simulation if any. The first element is the 430 | index of the iteration, the second is the value function, the third is 431 | the action-value function and the fourth is the TODO:. 
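        Note: the underlying _lstd routine is currently a stub that raises
        NotImplementedError, so this solver is not usable yet.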
432 | 433 | Raises 434 | ------ 435 | TransitionError: If any of the arguments is not of the correct type. 436 | ''' 437 | 438 | #policy = _set_policy(policy, eps, actions, approximator) 439 | 440 | _typecheck_all(transition=transition, 441 | constants=[gamma, alpha, n_episodes, max_steps, samples, tol], 442 | booleans=[optimize], policies=[policy]) 443 | 444 | _check_ranges(values=[gamma, alpha, n_episodes, max_steps, samples], 445 | ranges=[(0,1), (0,1), (1,np.inf), (1,np.inf), (1,1001), (1, np.inf)]) 446 | 447 | sample_step = _get_sample_step(samples, n_episodes) 448 | 449 | model = ModelFreeTL(transition, random_state, policy, gamma=gamma) 450 | v, q, samples = _lstd(model, state_0, action_0, 451 | alpha, int(n_episodes), int(max_steps), tol, optimize, sample_step) 452 | 453 | return AVQPi(v, q, policy), samples 454 | 455 | 456 | def _lstd(MF, s_0, a_0, alpha, n_episodes, max_steps, tol, optimize, sample_step): 457 | 458 | raise NotImplementedError 459 | 460 | 461 | def diff_semigradn(transition: Transition, 462 | random_state: Callable[[Any], Any], 463 | v_hat: SGDWA, 464 | q_hat: SGDWA=None, 465 | actions: Sequence[Any]=None, 466 | state_0: Any=None, 467 | action_0: Any=None, 468 | alpha: float=0.1, 469 | beta: float=0.1, 470 | n: int=1, 471 | T: int=1E5, 472 | samples: int=1000, 473 | optimize: bool=False, 474 | policy: ModelFreeTLPolicy=None, 475 | tol: float=TOL, 476 | eps: float=None) -> Tuple[AVQPi, Samples]: 477 | '''Differential semi gradient n-step Sarsa for estimation and control. 478 | 479 | The average reward setting is one of that comes to solve many problems 480 | related with discounted settings with function approximation. The average 481 | reward setting evaluates the quality of a policy by the average rate of reward. 482 | That is how good you expect the reward to be in average. 483 | 484 | Parameters 485 | ---------- 486 | transition : Callable[[Any,Any],[[Any,float], bool]]] 487 | transition must be a callable function that takes as arguments the 488 | (state, action) and returns (new_state, reward), end. 489 | random_state : Callable[[Any], Any] 490 | random state generator 491 | v_hat : SGDWA 492 | Function approximator to use for the state value function 493 | q_hat: SGDWA, optional 494 | Function approximator to use for the action-value function, by default None 495 | and will be replaced by a mocked version of q_hat where a one hot 496 | encoding is going to get appended to the state vector. 
497 | actions: Sequence[Any] 498 | Sequence of possible actions 499 | state_0 : Any, optional 500 | Initial state, by default None (random) 501 | action_0 : Any, optional 502 | Initial action, by default None (random) 503 | alpha : float, optional 504 | Learning rate, by default 0.1 505 | beta : float, optional 506 | Step size for average reward updates, by default 0.1 507 | n : int, optional 508 | Number of steps to look ahead, by default 1 509 | T : int, optional 510 | Number of time steps to simulate, by default 1E5 511 | samples : int, optional 512 | Number of samples to take, by default 1000 513 | optimize : bool, optional 514 | Whether to optimize the policy or not, by default False 515 | policy : ModelFreePolicy, optional 516 | Policy to use, by default equal probability ModelFreePolicy 517 | tol : float, optional 518 | Tolerance for estimating convergence estimations 519 | eps : float, optional 520 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 521 | 522 | Returns 523 | ------- 524 | vqpi : Tuple[VPi, QPi, Policy] 525 | Value function, action-value function, policy and samples if any. 526 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 527 | Samples taken during the simulation if any. The first element is the 528 | index of the iteration, the second is the value function, the third is 529 | the action-value function and the fourth is the Policy. 530 | 531 | Raises 532 | ------ 533 | TransitionError: If any of the arguments is not of the correct type. 534 | ''' 535 | policy = _set_policy(policy, eps, actions, v_hat, q_hat) 536 | 537 | _typecheck_all(transition=transition, 538 | constants=[alpha, beta, T, samples, tol], 539 | booleans=[optimize], policies=[policy]) 540 | 541 | _check_ranges(values=[alpha, beta, T, samples], 542 | ranges=[(0,1), (0,1), (1,np.inf), (1,1001)]) 543 | 544 | sample_step = _get_sample_step(samples, T) 545 | 546 | model = ModelFreeTL(transition, random_state, policy) 547 | vh, qh, samples = _diff_semigrad(model, v_hat, state_0, action_0, 548 | alpha, beta, n, int(T), tol, optimize, sample_step) 549 | 550 | return AVQPi(vh, qh, policy), samples 551 | 552 | 553 | def _diff_semigrad(MFS, v_hat, s_0, a_0, alpha, beta, n, T, tol, 554 | optimize, sample_step): 555 | ''' 556 | DRY but clear. 
Beta greek letter is written as 557 | ''' 558 | α, β, π = alpha, beta, MFS.policy 559 | q_hat = π.q_hat 560 | 561 | samples, dnorm = [], TOL*2 562 | s, a = _set_s0_a0(MFS, s_0, a_0) 563 | 564 | w_old = v_hat.w.copy() 565 | 566 | R, A, S, avg_R = [], [a], [s], 0 567 | for t in tqdm(range(T), desc=f'semigrad-TD', unit='episodes'): 568 | if dnorm < tol: 569 | break 570 | 571 | (s, r), end = MFS.step_transition(s, a) 572 | R.append(r) 573 | S.append(s) 574 | if end: 575 | break 576 | else: 577 | a = π(s) 578 | A.append(a) 579 | 580 | if t - n + 1 >= 0: 581 | rr = np.array(R) 582 | R_R = rr.sum() - avg_R*n 583 | δ_v = R_R + v_hat(S[n]) - v_hat(S[0]) 584 | δ_q = R_R + q_hat((S[n], A[n])) - q_hat((S[0], A[0])) 585 | 586 | avg_R = avg_R + β*δ_q 587 | 588 | s_t = S[0] 589 | a_t = A[0] 590 | 591 | v_hat.update(δ_v, α, s_t) 592 | if optimize: 593 | q_hat.update(δ_q, α, (s_t, a_t)) 594 | 595 | R.pop(0) 596 | A.pop(0) 597 | S.pop(0) 598 | 599 | dnorm = lnorm(w_old - v_hat.w) 600 | 601 | if t % sample_step == 0: 602 | samples.append(get_sample(v_hat, q_hat, π, t, optimize)) 603 | 604 | return v_hat, q_hat, samples 605 | 606 | 607 | def semigrad_td_lambda(transition: Transition, 608 | random_state: Callable, 609 | v_hat: SGDWA, 610 | q_hat: SGDWA=None, 611 | actions: Sequence[Any]=None, 612 | state_0: Any=None, 613 | action_0: Any=None, 614 | alpha: float=0.1, 615 | lambdaa: float=0.1, 616 | gamma: float=0.9, 617 | n_episodes: int=1E5, 618 | max_steps: int=1E3, 619 | samples: int=1000, 620 | optimize: bool=False, 621 | policy: ModelFreeTLPolicy=None, 622 | tol: float=TOL, 623 | eps: float=None) -> Tuple[AVQPi, Samples]: 624 | '''Semi-gradient TD(λ). 625 | 626 | Eligibility traces semi gradient TD(λ). This algorithms extends more 627 | generally to TD and MC. It also improves off-line λ-return algorithms following 628 | the forward view, alas backward view. It updates the weight vector on every step, 629 | improving sooner, and computations are equally distributed among the time steps. 630 | Also it can be applied to continuing problems rather than just episodic ones. 631 | 632 | Parameters 633 | ---------- 634 | transition : Callable[[Any,Any],[[Any,float], bool]]] 635 | transition must be a callable function that takes as arguments the 636 | (state, action) and returns (new_state, reward), end. 637 | random_state : Callable[[Any], Any] 638 | random state generator 639 | v_hat : SGDWA 640 | Function approximator to use for the state value function 641 | q_hat: SGDWA, optional 642 | Function approximator to use for the action-value function, by default None 643 | and will be replaced by a mocked version of q_hat where a one hot 644 | encoding is going to get appended to the state vector. 
645 | actions: Sequence[Any] 646 | Sequence of possible actions 647 | state_0 : Any, optional 648 | Initial state, by default None (random) 649 | action_0 : Any, optional 650 | Initial action, by default None (random) 651 | alpha : float, optional 652 | Learning rate, by default 0.1 653 | lambdaa : float, optional 654 | Learning rate, by default 0.1 655 | gamma : float, optional 656 | Step size for average reward updates, by default 0.1 657 | n_episodes : int, optional 658 | Number of time steps to simulate, by default 1E5 659 | max_steps : int, optional 660 | Maximum number of steps per episode, by default 1000 661 | samples : int, optional 662 | Number of samples to take, by default 1000 663 | optimize : bool, optional 664 | Whether to optimize the policy or not, by default False 665 | policy : ModelFreePolicy, optional 666 | Policy to use, by default equal probability ModelFreePolicy 667 | tol : float, optional 668 | Tolerance for estimating convergence estimations 669 | eps : float, optional 670 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 671 | 672 | Returns 673 | ------- 674 | vqpi : Tuple[VPi, QPi, Policy] 675 | Value function, action-value function, policy and samples if any. 676 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 677 | Samples taken during the simulation if any. The first element is the 678 | index of the iteration, the second is the value function, the third is 679 | the action-value function and the fourth is the Policy. 680 | 681 | Raises 682 | ------ 683 | TransitionError: If any of the arguments is not of the correct type. 684 | ''' 685 | policy = _set_policy(policy, eps, actions, v_hat, q_hat) 686 | 687 | _typecheck_all(transition=transition, 688 | constants=[alpha, gamma, lambdaa, n_episodes, samples, tol], 689 | booleans=[optimize], policies=[policy]) 690 | 691 | _check_ranges(values=[alpha, gamma, lambdaa, n_episodes, samples], 692 | ranges=[(0,1), (0,1), (0,1), (1,np.inf), (1,1001)]) 693 | 694 | sample_step = _get_sample_step(samples, T) 695 | 696 | model = ModelFreeTL(transition, random_state, policy) 697 | vh, qh, samples = _td_lambda(model, v_hat, state_0, action_0, alpha, 698 | lambdaa, int(n_episodes), int(max_steps), tol, optimize, sample_step) 699 | 700 | return AVQPi(vh, qh, policy), samples 701 | 702 | 703 | def _td_lambda(MFS, v_hat, s_0, a_0, alpha, lambdaa, n_episodes, max_steps, tol, 704 | sample_step, optimize): 705 | '''DRY but clear.''' 706 | α, γ, π, λ = alpha, MFS.gamma, MFS.policy, lambdaa 707 | q_hat = π.q_hat 708 | 709 | samples, dnorm = [], TOL*2 710 | for n_episode in tqdm(range(n_episodes), desc=f'semigrad-TD', unit='episodes'): 711 | if dnorm < tol: 712 | break 713 | s, a = _set_s0_a0(MFS, s_0, a_0) 714 | 715 | zv = np.zeros_like(v_hat.w) 716 | zq = np.zeros_like(q_hat.w) 717 | 718 | w_old = v_hat.w.copy() 719 | 720 | T = int(max_steps) 721 | for _ in range(T): 722 | (s_, r), end = MFS.step_transition(s, a) 723 | if end: 724 | break 725 | else: 726 | a = π(s) 727 | zv = γ*λ*zv + v_hat.grad(s) 728 | zq = γ*λ*zq + q_hat.grad((s, a)) 729 | Uv = r + γ*v_hat(s_) 730 | Uq = r + γ*q_hat(s_, a) 731 | 732 | v_hat.et_update(Uv, α, s, zv) 733 | 734 | if optimize: 735 | q_hat.et_update(Uq, α, (s, a), zq) 736 | 737 | s = s_ 738 | 739 | dnorm = lnorm(w_old - v_hat.w) 740 | 741 | if n_episode % sample_step == 0: 742 | samples.append(get_sample(v_hat, q_hat, π, n_episode, optimize)) 743 | n_episode += 1 744 | 745 | return v_hat, q_hat, samples 746 | 747 | 748 | def reinforce_mc(transition: Transition, 
749 |                  random_state: Callable,
750 |                  pi_hat: Approximator,
751 |                  actions: Sequence[Any]=None,
752 |                  state_0: Any=None,
753 |                  action_0: Any=None,
754 |                  alpha: float=0.1,
755 |                  gamma: float=0.9,
756 |                  n_episodes: int=MAX_ITER,
757 |                  max_steps: int=MAX_STEPS,
758 |                  samples: int=1000,
759 |                  policy: REINFORCEPolicy=None,
760 |                  tol: float=TOL) -> Tuple[REINFORCEPolicy, List[REINFORCEPolicy]]:
761 |     '''Monte Carlo policy-gradient (REINFORCE) control algorithm.
762 | 
763 |     This algorithm requires a differentiable policy. Whatever the underlying
764 |     approximator, its parameters are optimized via SGD. For more information
765 |     see Sutton & Barto, section 13.3 (p. 328).
766 | 
767 |     Parameters
768 |     ----------
769 |     transition : Callable[[Any,Any],[[Any,float], bool]]]
770 |         transition must be a callable function that takes as arguments the
771 |         (state, action) and returns (new_state, reward), end.
772 |     random_state : Callable[[Any], Any]
773 |         random state generator
774 |     pi_hat : Approximator
775 |         Function approximator for the differentiable policy. It is not
776 |         used if ``policy`` is provided.
777 |     actions: Sequence[Any]
778 |         Sequence of possible actions
779 |     state_0 : Any, optional
780 |         Initial state, by default None (random)
781 |     action_0 : Any, optional
782 |         Initial action, by default None (random)
783 |     alpha : float, optional
784 |         Learning rate, by default 0.1
785 |     gamma : float, optional
786 |         Discount factor, by default 0.9
787 |     n_episodes : int, optional
788 |         Number of episodes to simulate, by default 1E4
789 |     max_steps : int, optional
790 |         Maximum number of steps per episode, by default 1E3
791 |     samples : int, optional
792 |         Number of samples to take, by default 1000
793 |     policy : REINFORCEPolicy, optional
794 |         Policy to use; by default a new one is built from ``actions`` and ``pi_hat``.
795 |     tol : float, optional
796 |         Tolerance for convergence of the policy parameters.
797 | 
798 |     Returns
799 |     -------
800 |     pi : REINFORCEPolicy
801 |         The optimized differentiable policy.
802 |     samples : List[REINFORCEPolicy]
803 |         Snapshots of the policy taken during the simulation, if any.
804 | 
805 |     Raises
806 |     ------
807 |     TransitionError: If any of the arguments is not of the correct type.
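
    Examples
    --------
    Minimal illustrative sketch. The corridor transition, the ``random_state``
    generator and the ``pi_hat`` placeholder below are assumptions for the sake
    of the example, not objects shipped with this module:

    >>> import numpy as np
    >>> actions = ['left', 'right']
    >>> def transition(state, action):
    ...     s = state + (1 if action == 'right' else -1)
    ...     if s >= 3:
    ...         return (s, 1.0), True   # reward only at the right end
    ...     if s <= -3:
    ...         return (s, 0.0), True
    ...     return (s, 0.0), False
    >>> random_state = lambda *args: np.random.randint(-2, 3)
    >>> pi_hat = ...  # any differentiable Approximator over state features
    >>> pi, pi_samples = reinforce_mc(transition, random_state, pi_hat,
    ...                               actions=actions, alpha=2e-4, gamma=1.0,
    ...                               n_episodes=1000)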
808 | ''' 809 | policy = policy if policy else REINFORCEPolicy(actions, pi_hat) 810 | 811 | _typecheck_all(transition=transition, constants=[alpha, gamma, n_episodes, samples, tol], 812 | policies=[policy]) 813 | 814 | _check_ranges(values=[alpha, gamma, n_episodes, samples], 815 | ranges=[(0,1), (0,1), (1,np.inf), (1,1001)]) 816 | 817 | sample_step = _get_sample_step(samples, n_episodes) 818 | 819 | model = ModelFreeTL(transition, random_state, policy) 820 | pi, samples = _reinforce_mc(model, state_0, action_0, alpha, 821 | int(n_episodes), int(max_steps), tol, sample_step) 822 | 823 | return pi, samples 824 | 825 | 826 | def _reinforce_mc(MFS, s_0, a_0, alpha, n_episodes, max_steps, tol, sample_step): 827 | ''''not returning the usual sample set''' 828 | α, γ, π = alpha, MFS.gamma, MFS.policy 829 | gammatron = np.array([γ**i for i in range(max_steps)]) 830 | samples, dnorm = [], TOL*2 831 | for n_episode in tqdm(range(n_episodes), desc=f'MC Policy Gradient', unit='episodes'): 832 | s, a = _set_s0_a0(MFS, s_0, a_0) 833 | theta_old = deepcopy(π.pi_hat.w) 834 | episode = MFS.generate_episode(s, a, π, max_steps) 835 | rr = np.array([r for _, _, r in episode]) 836 | for t, (s, a, _) in enumerate(episode): 837 | G = gammatron[:len(episode)-t].dot(rr[t:]) 838 | c = α*G*(γ**t) 839 | π.update_policy(c, s, a) 840 | 841 | if n_episode % sample_step == 0: 842 | samples.append(deepcopy(π)) 843 | 844 | dnorm = lnorm(π.pi_hat.w - theta_old) 845 | if dnorm < tol: 846 | break 847 | 848 | return π, samples 849 | -------------------------------------------------------------------------------- /rl/solvers/model_based.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | from numpy.linalg import norm as lnorm 5 | 6 | from rl.utils import ( 7 | Policy, 8 | _typecheck_all, 9 | _get_sample_step, 10 | _check_ranges, 11 | VQPi, 12 | Samples, 13 | Transition, 14 | Vpi, 15 | Qpi, 16 | PQueue, 17 | MAX_ITER, 18 | MAX_STEPS, 19 | TOL 20 | ) 21 | 22 | #TODO: refactor, docs 23 | 24 | def get_sample(MDP, v, q, π, n_iter): 25 | _idx = n_iter 26 | # TODO: refactor, there is no states tabular index here 27 | # and there is not stateaction 28 | _v, _q = Vpi(v.copy(), MDP.states), Qpi(q.copy(), MDP.stateaction) 29 | _pi = None 30 | return (_idx, _v, _q, _pi) 31 | 32 | 33 | # There are in-place methods optional. That is, there is actually an 34 | # inplace sweep of all states instantaneously, through the vectorization 35 | # of the update equations for DP methods. Should be Faster to execute and 36 | # slower to converge. But tests must be carried out to verify this claim. 
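
# ---------------------------------------------------------------------------
# Editorial sketch (not part of this module): the two sweep styles below on a
# made-up 3-state MDP whose fixed policy is already folded into a transition
# kernel P and an expected-reward vector r. The synchronous ("naive") sweep
# reads only values from the previous iteration, while the in-place sweep
# reuses values refreshed earlier in the same pass; both converge to the same
# fixed point v = r + γ P v.
def _policy_evaluation_sketch():
    import numpy as np
    P = np.array([[0.9, 0.1, 0.0],    # assumed P(s'|s) under the fixed policy
                  [0.0, 0.5, 0.5],
                  [0.1, 0.0, 0.9]])
    r = np.array([0.0, 1.0, 2.0])     # assumed expected reward per state
    gamma = 0.9

    v_sync, v_inplace = np.zeros(3), np.zeros(3)
    for _ in range(200):
        v_sync = r + gamma * P @ v_sync         # synchronous, fully vectorized
        for s in range(3):                      # in-place (Gauss-Seidel style)
            v_inplace[s] = r[s] + gamma * P[s] @ v_inplace
    return v_sync, v_inplace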
37 | 38 | 39 | def vq_pi_iter_naive(MDP, policy: Policy, tol: float=TOL, inplace=False, 40 | max_iters: int=MAX_STEPS) -> Tuple[VQPi, Samples]: 41 | 42 | sample_step = _get_sample_step(samples, max_iters//10) # RULE OF THUMB 43 | 44 | v, q, samples = _vq_pi_iter_naive(MDP, policy, tol, max_iters, inplace, 45 | sample_step) 46 | 47 | return VQPi((v, q, policy)), samples 48 | 49 | 50 | def _inplace_step_pe(MDP, vᵢ, _, π_sa, r_sa, p_s, γ): 51 | for s in range(MDP.S): 52 | vᵢ[s] = np.dot(π_sa[s], r_sa[:,s]) 53 | vᵢ[s] += γ * np.dot(p_s[s] @ vᵢ, π_sa[s]) 54 | return vᵢ 55 | 56 | 57 | def _naive_step_pe(_, vᵢ, vᵢ_1, π_sa, r_sa, p_s, γ): 58 | vᵢ = np.diag(π_sa @ r_sa) 59 | vᵢ = vᵢ + γ * np.diag((p_s @ vᵢ_1) @ π_sa.T) 60 | return vᵢ 61 | 62 | 63 | # pe: policy evaluation 64 | ITER_NAIVE_STEP_MAP = { 65 | 'inplace': _inplace_step_pe, 66 | 'naive': _naive_step_pe 67 | } 68 | 69 | 70 | def _vq_pi_iter_naive(MDP, policy, tol, max_iters, inplace, sample_step): 71 | γ = MDP.gamma 72 | p_s = MDP.p_s 73 | 74 | vᵢ = np.ones(MDP.S) 75 | diff_norm = TOL*2 76 | 77 | update_step = ITER_NAIVE_STEP_MAP['inplace' if inplace else 'naive'] 78 | 79 | π_sa = np.array([policy.π(s) for s in range(MDP.S)]) #SxA 80 | r_sa = np.array([[MDP.r_sa(s,a) for s in range(MDP.S)] 81 | for a in range(MDP.A)]) #AxS 82 | 83 | n_iter, samples = 0, [] 84 | while (n_iter < max_iters) and (diff_norm > tol): 85 | vᵢ_1 = vᵢ.copy() 86 | vᵢ = update_step(MDP, vᵢ, vᵢ_1, π_sa, r_sa, p_s, γ) 87 | diff_norm = lnorm(vᵢ - vᵢ_1) 88 | n_iter += 1 89 | 90 | if n_iter % sample_step == 0: 91 | samples.append(get_sample(MDP, vᵢ, None, policy, n_iter)) 92 | 93 | vπ = vᵢ 94 | qπ = r_sa + (p_s @ vπ).T 95 | 96 | return vπ, qπ 97 | 98 | 99 | def policy_iteration(MDP, policy: Policy, tol_eval: float = TOL, 100 | max_iters_eval: int = MAX_ITER, tol_opt: float = TOL, 101 | max_iters_opt: int = MAX_ITER, samples: int=1000 102 | ) -> Tuple[VQPi, Samples]: 103 | 104 | vᵢ_1, q_i_1 = vq_pi_iter_naive(MDP, policy, tol_eval, max_iters_eval) 105 | vᵢ, q_i = vᵢ_1.copy(), q_i_1.copy() 106 | 107 | diff_norm = 2*tol_opt 108 | 109 | n_iter = 0 110 | 111 | while (n_iter < max_iters_opt) and (diff_norm > tol_opt): 112 | vᵢ_1 = vᵢ.copy() 113 | q_i_1 = q_i.copy() 114 | 115 | policy.update_policy(q_i_1) 116 | vᵢ, q_i = vq_pi_iter_naive(MDP, policy, tol_eval, max_iters_eval) 117 | 118 | n_iter += 1 119 | diff_norm = lnorm(vᵢ - vᵢ_1) 120 | 121 | return vᵢ, q_i, samples 122 | 123 | 124 | def _inplace_step_vi(MDP, vᵢ, _, r_sa, p_s, γ): 125 | for s in range(MDP.S): 126 | vᵢ[s] = np.max(r_sa[:,s] + γ * (p_s[s] @ vᵢ)) 127 | return vᵢ, None 128 | 129 | 130 | def _naive_step_vi(_, vᵢ, vᵢ_1, r_sa, p_s, γ): 131 | qᵢ = r_sa + γ * (p_s @ vᵢ_1).T 132 | vᵢ = np.max(qᵢ, axis=0) 133 | return vᵢ, qᵢ 134 | 135 | 136 | VALUE_ITERATION_STEP_MAP = { 137 | 'inplace': _inplace_step_vi, 138 | 'naive': _naive_step_vi 139 | } 140 | 141 | 142 | def value_iteration(MDP, policy: Policy = None, inplace: bool=False, 143 | tol: float = TOL, max_iters: int=MAX_ITER) -> Tuple[VQPi, Samples]: 144 | 145 | sample_step = _get_sample_step(samples, max_iters//10) # RULE OF THUMB 146 | 147 | v, q, samples = _value_iteration(MDP, policy, tol, max_iters, inplace, 148 | sample_step) 149 | 150 | return VQPi((v, q, policy)), samples 151 | 152 | 153 | def _value_iteration(MDP, policy, tol, max_iters, inplace, sample_step): 154 | policy = policy if policy else MDP.policy 155 | 156 | γ = MDP.gamma 157 | p_s = MDP.p_s 158 | 159 | vᵢ = np.ones(MDP.S) 160 | diff_norm = TOL*2 161 | 162 | update_step = 
VALUE_ITERATION_STEP_MAP['inplace' if inplace else 'naive'] 163 | 164 | r_sa = np.array([[MDP.r_sa(s,a) for s in range(MDP.S)] 165 | for a in range(MDP.A)]) #AxS 166 | 167 | n_iter, samples = 0, [] 168 | while (n_iter < max_iters) and (diff_norm > tol): 169 | vᵢ_1 = vᵢ.copy() 170 | vᵢ, qᵢ = update_step(MDP, vᵢ, vᵢ_1, r_sa, p_s, γ) 171 | diff_norm = lnorm(vᵢ - vᵢ_1) 172 | n_iter += 1 173 | 174 | if n_iter % sample_step == 0: 175 | samples.append(get_sample(MDP, vᵢ, qᵢ, policy, n_iter)) 176 | 177 | policy.update_policy(qᵢ) -------------------------------------------------------------------------------- /rl/solvers/model_free.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL - Copyright © 2023 Iván Belenky @Leculette 3 | """ 4 | 5 | from typing import ( 6 | Tuple, 7 | Sequence, 8 | Any 9 | ) 10 | 11 | from tqdm import tqdm 12 | import numpy as np 13 | from numpy.linalg import norm as lnorm 14 | 15 | from rl.model_free import ( 16 | ModelFree, 17 | ModelFreePolicy, 18 | EpsilonSoftPolicy 19 | ) 20 | from rl.utils import ( 21 | Policy, 22 | _typecheck_all, 23 | _get_sample_step, 24 | _check_ranges, 25 | VQPi, 26 | Samples, 27 | Transition, 28 | Vpi, 29 | Qpi, 30 | PQueue, 31 | MAX_ITER, 32 | MAX_STEPS, 33 | TOL 34 | ) 35 | 36 | 37 | def get_sample(MF, v, q, π, n_episode, optimize): 38 | _idx = n_episode 39 | _v, _q = Vpi(v.copy(), MF.states), Qpi(q.copy(), MF.stateaction) 40 | _pi = None 41 | if optimize: 42 | _pi = ModelFreePolicy(MF.actions.N, MF.states.N) 43 | _pi.pi = π.pi.copy() 44 | return (_idx, _v, _q, _pi) 45 | 46 | 47 | def _set_s0_a0(MF, s, a): 48 | s_0, a_0 = MF.random_sa() 49 | s_0 = s_0 if not s else s 50 | a_0 = a_0 if not a else a 51 | return s_0, a_0 52 | 53 | 54 | def _set_policy(policy, eps, actions, states): 55 | if not policy and eps: 56 | _typecheck_all(constants=[eps]) 57 | _check_ranges(values=[eps], ranges=[(0,1)]) 58 | policy = EpsilonSoftPolicy(actions, states, eps=eps) 59 | elif not policy: 60 | policy = ModelFreePolicy(actions, states) 61 | 62 | return policy 63 | 64 | 65 | def alpha_mc(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 66 | gamma: float=0.9, alpha: float=0.05, use_N :bool=False, first_visit: bool=True, 67 | exploring_starts: bool=True, n_episodes: int=MAX_ITER, max_steps: int=MAX_STEPS, 68 | samples: int=1000, optimize: bool=False, policy: ModelFreePolicy=None, 69 | eps: float=None) -> Tuple[VQPi, Samples]: 70 | '''α-MC state and action-value function estimation, policy optimization 71 | 72 | Alpha weighted Monte Carlo state and action-value function estimation, policy 73 | optimization. By setting use_N to True, it will use the classical weighting 74 | schema, utilizing N(s) instead of a contstant α. 75 | 76 | Parameters 77 | ---------- 78 | states : Sequence[Any] 79 | actions : Sequence[Any] 80 | transition : Callable[[Any,Any],[[Any,float], bool]]] 81 | transition must be a callable function that takes as arguments the 82 | (state, action) and returns (new_state, reward), end. 83 | gamma : float, optional 84 | Discount factor, by default 0.9 85 | alpha : float, optional 86 | Learning rate, by default 0.1 87 | use_N : bool, optional 88 | If true, it will use 1/N(s) (number of visits) instead of α 89 | first_visit : bool, optional 90 | If true, it will only use the first visit to a state, by default True 91 | exploring_starts : bool, optional 92 | Random action at the start of each episode. 
93 | n_episodes : int, optional 94 | Number of episodes to simulate, by default 1E4 95 | max_steps : int, optional 96 | Maximum number of steps per episode, by default 1E3 97 | samples : int, optional 98 | Number of samples to take, by default 1000 99 | optimize : bool, optional 100 | Whether to optimize the policy or not, by default False 101 | policy : ModelFreePolicy, optional 102 | Policy to use, by default equal probability ModelFreePolicy 103 | eps : float, optional 104 | Epsilon for the EpsilonSoftPolicy, by default None (no exploration) 105 | 106 | Returns 107 | ------- 108 | vqpi : Tuple[VPi, QPi, Policy] 109 | Value function, action-value function, policy and samples if any. 110 | samples : Tuple[int, Vpi, Qpi, Policy] 111 | Samples taken during the simulation if any. The first element is the 112 | index of the iteration, the second is the value function, the third is 113 | the action-value function and the fourth is the policy until 114 | optimization point idx. 115 | 116 | Raises 117 | ------ 118 | TransitionException: transition calls function checks. 119 | ''' 120 | policy = _set_policy(policy, eps, actions, states) 121 | 122 | _typecheck_all(tabular_idxs=[states, actions],transition=transition, 123 | constants=[gamma, alpha, n_episodes, max_steps, samples], 124 | booleans=[use_N, first_visit, exploring_starts, optimize], 125 | policies=[policy]) 126 | 127 | _check_ranges(values=[gamma, alpha, n_episodes, max_steps, samples], 128 | ranges=[(0,1), (0,1), (1,np.inf), (1,np.inf), (1,1001)]) 129 | 130 | sample_step = _get_sample_step(samples, n_episodes) 131 | 132 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 133 | v, q, samples = _visit_monte_carlo(model, first_visit, exploring_starts, use_N, 134 | alpha, int(n_episodes), max_steps, optimize, sample_step) 135 | 136 | return VQPi((v, q, model.policy.pi)), samples 137 | 138 | 139 | def _mc_step(v, q, t, s_t, a_t, s, a, n_s, n_sa, G, first_visit): 140 | if s_t not in s[:-(t+1)] or not first_visit: 141 | n_s[s_t] = n_s[s_t] + 1 142 | v[s_t] = v[s_t] + (G - v[s_t])/n_s[s_t] 143 | 144 | q_key = (s_t, a_t) 145 | if q_key not in zip(s[:-(t+1)],a[:-(t+1)]) or not first_visit: 146 | n_sa[q_key] = n_sa[q_key] + 1 147 | q[q_key] = q[q_key] + (G - q[q_key])/n_sa[q_key] 148 | return True 149 | 150 | return False 151 | 152 | 153 | def _mc_step_α(v, q, t, s_t, a_t, s, a, α, G, first_visit): 154 | if s_t not in s[:-(t+1)] or not first_visit: 155 | v[s_t] = v[s_t] + α*(G - v[s_t]) 156 | 157 | q_key = (s_t, a_t) 158 | if q_key not in zip(s[:-(t+1)],a[:-(t+1)]) or not first_visit: 159 | q[q_key] = q[q_key] + α*(G - q[q_key]) 160 | return True 161 | 162 | return False 163 | 164 | 165 | def _visit_monte_carlo(MF, first_visit, exploring_starts, use_N, alpha, 166 | n_episodes, max_steps, optimize, sample_step): 167 | 168 | π = MF.policy 169 | γ = MF.gamma 170 | α = alpha 171 | 172 | samples = [] 173 | 174 | v, q = np.zeros(MF.states.N), np.zeros((MF.states.N, MF.actions.N)) 175 | if use_N: 176 | n_s, n_sa = np.zeros(MF.states.N), np.zeros((MF.states.N, MF.actions.N)) 177 | 178 | s_0, a_0 = MF.random_sa(value=True) 179 | 180 | for n_episode in tqdm(range(n_episodes), desc='Monte Carlo', unit='episodes'): 181 | if exploring_starts: 182 | s_0, a_0 = MF.random_sa(value=True) 183 | 184 | episode = MF.generate_episode(s_0, a_0, π, max_steps) 185 | sar = np.array(episode) 186 | s, a, _ = sar.T 187 | 188 | G = 0 189 | for t, (s_t, a_t, r_tt) in enumerate(sar[::-1]): 190 | s_t, a_t = int(s_t), int(a_t) 191 | G = γ*G + r_tt 192 | 
if use_N: 193 | update = _mc_step(v, q, t, s_t, a_t, s, a, n_s, 194 | n_sa, G, first_visit) 195 | else: 196 | update = _mc_step_α(v, q, t, s_t, a_t, s, a, 197 | α, G, first_visit) 198 | if optimize and update: 199 | π.update_policy(q, s_t) 200 | 201 | if sample_step and n_episode % sample_step == 0: 202 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 203 | 204 | return v, q, samples 205 | 206 | 207 | def off_policy_mc(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 208 | gamma: float=0.9, first_visit: bool=True, ordinary: bool=False, 209 | n_episodes: int=MAX_ITER, max_steps: int=MAX_STEPS, samples: int=1000, 210 | optimize: bool=False, policy: ModelFreePolicy=None, eps: float=None, 211 | b: ModelFreePolicy=None) -> Tuple[VQPi, Samples]: 212 | '''Off-policy Monte Carlo state and action value function estimation, policy 213 | 214 | Off policy Monte Carlo method for estimating state and action-value functtions 215 | as well as optimizing policies. If no behavior policy is provided an 216 | equal probability one for each (s,a) pair will be used. In order to guarantee 217 | convergence you must specify 218 | 219 | Parameters 220 | ---------- 221 | states : Sequence[Any] 222 | actions : Sequence[Any] 223 | transition : Callable[[Any,Any],[[Any,float], bool]]] 224 | transition must be a callable function that takes as arguments the 225 | (state, action) and returns (new_state, reward), end. 226 | gamma : float, optional 227 | Discount factor, by default 0.9 228 | first_visit : bool, optional 229 | If true, it will only use the first visit to a state, by default True 230 | ordinary : bool, optional 231 | ordinary sampling, beware! high variance, by default False 232 | n_episodes : int, optional 233 | Number of episodes to simulate, by default 1E4 234 | max_steps : int, optional 235 | Maximum number of steps per episode, by default 1E3 236 | samples : int, optional 237 | Number of samples to take, by default 1000 238 | optimize : bool, optional 239 | Whether to optimize the policy or not, by default False 240 | policy : ModelFreePolicy, optional 241 | Policy to use, by default equal probability ModelFreePolicy 242 | eps : float, optional 243 | Epsilon for the EpsilonSoftPolicy, by default None (no exploration) 244 | b : ModelFreePolicy, optional 245 | Behavior policy, by default None (equal probability ModelFreePolicy) 246 | 247 | Returns 248 | ------- 249 | vqpi : Tuple[VPi, QPi, Policy] 250 | Value function, action-value function, policy and samples if any. 251 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 252 | Samples taken during the simulation if any. The first element is the 253 | index of the iteration, the second is the value function, the third is 254 | the action-value function and the fourth is the TODO:. 255 | 256 | Raises 257 | ------ 258 | TransitionException: transition calls function checks. 
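
    Examples
    --------
    Illustrative sketch; the one-state transition below is an assumption, not
    part of this package. Episodes are generated with the default equiprobable
    behavior policy ``b`` while the target ``policy`` is evaluated:

    >>> import numpy as np
    >>> states = [0]
    >>> actions = ['left', 'right']
    >>> def transition(state, action):
    ...     if action == 'right':
    ...         return (state, 0.0), True
    ...     if np.random.rand() > 0.9:
    ...         return (state, 1.0), True
    ...     return (state, 0.0), False
    >>> vqpi, samples = off_policy_mc(states, actions, transition, gamma=1.0,
    ...                               first_visit=True, n_episodes=5000)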
259 | ''' 260 | 261 | policy = _set_policy(policy, eps, actions, states) 262 | if not b: 263 | b = ModelFreePolicy(actions, states) 264 | 265 | _typecheck_all(tabular_idxs=[states, actions],transition=transition, 266 | constants=[gamma, n_episodes, max_steps, samples], 267 | booleans=[first_visit, optimize], 268 | policies=[policy, b]) 269 | _check_ranges(values=[gamma, n_episodes, max_steps, samples], 270 | ranges=[(0,1), (1,np.inf), (1,np.inf), (1,1001)]) 271 | 272 | sample_step = _get_sample_step(samples, n_episodes) 273 | 274 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 275 | v, q, samples = _off_policy_monte_carlo(model, b, int(n_episodes), 276 | max_steps, first_visit, ordinary, optimize, sample_step) 277 | 278 | return VQPi((v, q, policy)), samples 279 | 280 | 281 | def _mc_step_off(q, v, t, s_t, a_t, s, a, G, w, c, c_q, 282 | first_visit, ordinary): 283 | 284 | c_add = 1 if ordinary else w 285 | denom = w if ordinary else 1 286 | 287 | if s_t not in s[:-(t+1)] or not first_visit: 288 | c[s_t] = c[s_t] + c_add 289 | if w < 1E-10: 290 | if ordinary: 291 | v[s_t] = v[s_t] - 1/c[s_t] * v[s_t] 292 | else: 293 | v[s_t] = v[s_t] + w/c[s_t] * (G - v[s_t]/denom) 294 | 295 | q_key = (s_t, a_t) 296 | if q_key not in zip(s[:-(t+1)],a[:-(t+1)]) or not first_visit: 297 | c_q[q_key] = c_q[q_key] + c_add 298 | if w < 1E-10: 299 | if ordinary: 300 | q[q_key] = q[q_key] - 1/c_q[q_key] * q[q_key] 301 | else: 302 | q[q_key] = q[q_key] + w/c_q[q_key] * (G - q[q_key]/denom) 303 | return True 304 | 305 | return False 306 | 307 | 308 | def _off_policy_monte_carlo(MF, off_policy, n_episodes, max_steps, first_visit, 309 | ordinary, optimize, sample_step): 310 | 311 | γ = MF.gamma 312 | b = off_policy 313 | π = MF.policy 314 | 315 | samples = [] 316 | 317 | v, q = np.zeros(MF.states.N), np.zeros((MF.states.N, MF.actions.N)) 318 | c, c_q = np.zeros(MF.states.N), np.zeros((MF.states.N, MF.actions.N)) 319 | 320 | for n_episode in tqdm(range(int(n_episodes)), desc='Off-policy MC', unit='episodes'): 321 | G = 0. 322 | s_0, a_0 = MF.random_sa(value=True) 323 | episode = MF.generate_episode(s_0, a_0, b, max_steps) 324 | sar = np.array(episode) 325 | s, a, _ = sar.T 326 | 327 | w = 1. 328 | for t, (s_t, a_t, r_tt) in enumerate(sar[::-1]): 329 | if w < 1E-10: 330 | break 331 | 332 | s_t, a_t = int(s_t), int(a_t) 333 | 334 | rho = π.pi_as(a_t, s_t)/b.pi_as(a_t, s_t) 335 | w = w*rho 336 | 337 | G = γ*G + r_tt 338 | update = _mc_step_off(q, v, t, s_t, a_t, s, a, 339 | G, w, c, c_q, first_visit, ordinary) 340 | 341 | if update and optimize: 342 | π.update_policy(q, s_t) 343 | 344 | if sample_step and n_episode % sample_step == 0: 345 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 346 | 347 | return v, q, samples 348 | 349 | 350 | 351 | def tdn(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 352 | state_0: Any=None, action_0: Any=None, gamma: float=0.9, n: int=1, 353 | alpha: float=0.05, n_episodes: int=MAX_ITER, policy: ModelFreePolicy=None, 354 | eps: float=None, optimize: bool=False, method: str='sarsa', samples: int=1000, 355 | max_steps: int=MAX_STEPS) -> Tuple[VQPi, Samples]: 356 | '''N-temporal differences algorithm. 357 | 358 | Temporal differences algorithm for estimating the value function of a 359 | policy, improve it and analyze it. 
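
    Depending on ``method`` this dispatches to on-policy n-step Sarsa
    ('sarsa_on'), the quasi-off-policy Sarsa / Q-learning / expected-Sarsa
    variants, or double Q-learning ('dqlearning'); see the Examples section
    below.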
360 | 361 | Parameters 362 | ---------- 363 | states : Sequence[Any] 364 | actions : Sequence[Any] 365 | transition : Callable[[Any,Any],[[Any,float], bool]]] 366 | transition must be a callable function that takes as arguments the 367 | (state, action) and returns (new_state, reward), end. 368 | state_0 : Any, optional 369 | Initial state, by default None (random) 370 | action_0 : Any, optional 371 | Initial action, by default None (random) 372 | gamma : float, optional 373 | Discount factor, by default 0.9 374 | n : int, optional 375 | Number of steps to look ahead, by default 1 376 | alpha : float, optional 377 | Learning rate, by default 0.1 378 | n_episodes : int, optional 379 | Number of episodes to simulate, by default 1E4 380 | max_steps : int, optional 381 | Maximum number of steps per episode, by default 1E3 382 | policy : ModelFreePolicy, optional 383 | Policy to use, by default equal probability ModelFreePolicy 384 | eps : float, optional 385 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 386 | optimize : bool, optional 387 | Whether to optimize the policy or not, by default False 388 | samples : int, optional 389 | Number of samples to take, by default 1000 390 | 391 | Returns 392 | ------- 393 | vqpi : Tuple[VPi, QPi, Policy] 394 | Value function, action-value function, policy and samples if any. 395 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 396 | Samples taken during the simulation if any. The first element is the 397 | index of the iteration, the second is the value function, the third is 398 | the action-value function and the fourth is the TODO:. 399 | 400 | Raises 401 | ------ 402 | TypeError: If any of the arguments is not of the correct type. 403 | 404 | Examples 405 | -------- 406 | Define state action pairs 407 | >>> from rl import tdn 408 | >>> states = [0] 409 | >>> actions = ['left', 'right'] 410 | Define the transition method, taking (state, action) 411 | and returning (new_state, reward), end 412 | >>> def state_transition(state, action): 413 | >>> if action == 'right': 414 | >>> return (state, 0), True 415 | >>> if action == 'left': 416 | >>> threshold = np.random.random() 417 | >>> if threshold > 0.9: 418 | >>> return (state, 1), True 419 | >>> else: 420 | >>> return (state, 0), False 421 | Solve! 
422 | >>> tdn(states, actions, state_transition, gamma=1, n=3, alpha=0.05) 423 | (array([0.134]), array([[0.513., 0.]]), , None) 424 | ''' 425 | policy = _set_policy(policy, eps, actions, states) 426 | 427 | if method not in METHODS: 428 | raise ValueError( 429 | f'Unknown method {method}\n' 430 | 'Available methods are (sarsa, sarsa_on, qlearning, expected_sarsa' 431 | ', dqlearning)') 432 | 433 | _typecheck_all(tabular_idxs=[states,actions], transition=transition, 434 | constants=[gamma, n, alpha, n_episodes, samples, max_steps], 435 | booleans=[optimize], policies=[policy]) 436 | 437 | sample_step = _get_sample_step(samples, n_episodes) 438 | 439 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 440 | 441 | _tdn = METHOD_MAP[method] 442 | 443 | v, q, samples = _tdn(model, state_0, action_0, n, alpha, int(n_episodes), 444 | max_steps, optimize, method, sample_step) 445 | 446 | return VQPi((v, q, policy)), samples 447 | 448 | 449 | def _td_step(s, a, r, t, T, n, v, q, γ, α, gammatron, π=None): 450 | '''td step update''' 451 | s_t, a_t, rr = s[t], a[t], r[t:t+n] 452 | G = np.dot(gammatron[:rr.shape[0]], rr) 453 | G_v, G_q = G, G 454 | if t + n < T: 455 | G_v = G_v + (γ**n) * v[s[t+n]] 456 | G_q = G_q + (γ**n) * q[s[t+n], a[t+n]] 457 | 458 | v[s_t] = v[s_t] + α * (G_v - v[s_t]) 459 | q_key = (s_t, a_t) 460 | q[q_key] = q[q_key] + α * (G_q - q[q_key]) 461 | 462 | 463 | def _td_qlearning(s, a, r, t, T, n, v, q, γ, α, gammatron, π=None): 464 | '''td qlearning update''' 465 | s_t, a_t, rr = s[t], a[t], r[t:t+n] 466 | G = np.dot(gammatron[:rr.shape[0]], rr) 467 | if t + n < T: 468 | G = G + (γ**n) * np.max(q[s[t+n]]) 469 | 470 | v[s_t] = v[s_t] + α * (G - v[s_t]) 471 | q_key = (s_t, a_t) 472 | q[q_key] = q[q_key] + α * (G - q[q_key]) 473 | 474 | 475 | def _td_expected_sarsa(s, a, r, t, T, n, v, q, γ, α, gammatron, π=None): 476 | s_t, a_t, rr = s[t], a[t], r[t:t+n] 477 | G = np.dot(gammatron[:rr.shape[0]], rr) 478 | if t + n < T: 479 | G = G + (γ**n) * np.dot(π.pi[s[t+n]], q[s[t+n]]) 480 | 481 | v[s_t] = v[s_t] + α * (G - v[s_t]) 482 | q_key = (s_t, a_t) 483 | q[q_key] = q[q_key] + α * (G - q[q_key]) 484 | 485 | 486 | STEP_MAP = { 487 | 'sarsa': _td_step, 488 | 'qlearning': _td_qlearning, 489 | 'expected_sarsa': _td_expected_sarsa, 490 | } 491 | 492 | 493 | def _tdn_onoff(MF, s_0, a_0, n, alpha, n_episodes, max_steps, optimize, 494 | method, sample_step): 495 | '''N-temporal differences algorithm. 496 | 497 | This is the basic implementation of the N-temporal difference algorithm. 498 | When optimizing the policy, the method for updating will be quasi-off 499 | policy. That is the updates are taking place with respect to the q-values 500 | updated on each step, but each step corresponds to the old policy. This 501 | implies that at the beginning of the updates are strictly on policy, and 502 | at the end, when probably all the states have been visited, the updates 503 | are off policy. 
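
    For reference, each step function in STEP_MAP builds the usual n-step
    target

        G = R_{t+1} + γ R_{t+2} + ... + γ^{n-1} R_{t+n} + γ^n B(S_{t+n})

    where the bootstrap term B is q[S_{t+n}, A_{t+n}] for 'sarsa',
    max_a q[S_{t+n}, a] for 'qlearning' and Σ_a π(a|S_{t+n}) q[S_{t+n}, a] for
    'expected_sarsa', and is dropped once t + n runs past the end of the
    episode; v[S_t] and q[S_t, A_t] are then moved toward G with step size α.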
504 | ''' 505 | π = MF.policy 506 | α = alpha 507 | γ = MF.gamma 508 | gammatron = np.array([γ**i for i in range(n)]) 509 | v, q = MF.init_vq() 510 | 511 | 512 | f_step = STEP_MAP[method] 513 | 514 | samples = [] 515 | for n_episode in tqdm(range(n_episodes), desc=f'td({n-1})', unit='episode'): 516 | if not s_0: 517 | s_0, _ = MF.random_sa(value=True) 518 | if not a_0: 519 | _, a_0 = MF.random_sa(value=True) 520 | episode = MF.generate_episode(s_0, a_0, π, max_steps) 521 | 522 | sar = np.array(episode) 523 | s, a, r = sar[:,0], sar[:,1], sar[:,2] 524 | 525 | s = s.astype(int) 526 | a = a.astype(int) 527 | 528 | T = s.shape[0] 529 | for t in range(T): 530 | f_step(s, a, r, t, T, n, v, q, γ, α, gammatron, π) 531 | # episode is already set so next step is not generated 532 | # via a greedy strategy, each episode generation is greedy 533 | if optimize: 534 | # in/out-place update for current and next episode 535 | # off policy without importance weighting 536 | π.update_policy(q, s[t]) 537 | 538 | if sample_step and n_episode % sample_step == 0: 539 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 540 | 541 | return v, q, samples 542 | 543 | 544 | def _td_dq_step(s, a, r, t, T, n, v1, q1, v2, q2, γ, α, gammatron, π): 545 | '''td step update''' 546 | s_t, a_t, rr = s[t], a[t], r[t:t+n] 547 | G = np.dot(gammatron[:rr.shape[0]], rr) 548 | G_v, G_q = G, G 549 | if t + n < T: 550 | G_v = G_v + (γ**n) * v2[s[t+n]] 551 | G_q = G_q + (γ**n) * q2[s[t+n], np.argmax(q1[s[t+n]])] 552 | 553 | v1[s_t] = v1[s_t] + α * (G_v - v1[s_t]) 554 | q_key = (s_t, a_t) 555 | q1[q_key] = q1[q_key] + α * (G_q - q1[q_key]) 556 | 557 | 558 | def _double_q(MF, s_0, a_0, n, alpha, n_episodes, max_steps, optimize, 559 | method, sample_step): 560 | 561 | π, α, γ = MF.policy, alpha, MF.gamma 562 | gammatron = np.array([γ**i for i in range(n)]) 563 | 564 | v1, q1 = MF.init_vq() 565 | v2, q2 = MF.init_vq() 566 | v, q = MF.init_vq() 567 | 568 | samples = [] 569 | for n_episode in tqdm(range(n_episodes)): 570 | s_0, a_0 = _set_s0_a0(MF, s_0, a_0) 571 | episode = MF.generate_episode(s_0, a_0, π, max_steps) 572 | 573 | sar = np.array(episode) 574 | s, a, r = sar[:,0], sar[:,1], sar[:,2] 575 | 576 | s = s.astype(int) 577 | a = a.astype(int) 578 | 579 | T = s.shape[0] 580 | for t in range(T): 581 | if np.random.rand() < 0.5: 582 | _td_dq_step(s, a, r, t, T, n, v1, q1, v2, q2, γ, α, gammatron, π) 583 | else: 584 | _td_dq_step(s, a, r, t, T, n, v2, q2, v1, q1, γ, α, gammatron, π) 585 | 586 | v = (v1 + v2)/2 587 | q = (q1 + q2)/2 588 | 589 | if optimize: 590 | π.update_policy(q, s[t]) 591 | 592 | if sample_step and n_episode % sample_step == 0: 593 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 594 | 595 | return v, q, samples 596 | 597 | 598 | def _tdn_on(MF, s_0, a_0, n, alpha, n_episodes, max_steps, optimize, 599 | method, sample_step): 600 | '''N-temporal differences algorithm for learning. 
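
    Rewards, actions and states are buffered in R, A and S; at step t the state
    visited at τ = t - n + 1 is updated (once τ >= 0), and the episode loop
    stops when τ reaches T - 1.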
601 | 602 | Super slow and inefficient, but readable and replicated exactly 603 | from sutton's n-step SARSA 604 | ''' 605 | π, α, γ = MF.policy, alpha, MF.gamma 606 | gammatron = np.array([γ**i for i in range(n)]) 607 | 608 | v, q = MF.init_vq() 609 | 610 | samples = [] 611 | for n_episode in tqdm(range(n_episodes), desc=f'td({n-1}) variant', unit='episode'): 612 | s_0, a_0 = _set_s0_a0(MF, s_0, a_0) 613 | 614 | s = MF.states.get_index(s_0) 615 | a = MF.actions.get_index(a_0) 616 | T = int(max_steps) 617 | R, A, S, G = [], [a], [s], 0 618 | for t in range(T): 619 | if t < T: 620 | (s, r), end = MF.step_transition(s, a) 621 | R.append(r) 622 | S.append(s) 623 | if end: 624 | T = t + 1 625 | else: 626 | a = π(s) 627 | A.append(a) 628 | 629 | tau = t - n + 1 630 | if tau >= 0: 631 | rr = np.array(R[tau:min(tau+n, T)]) 632 | G = gammatron[:rr.shape[0]].dot(rr) 633 | G_v, G_q = G, G 634 | if tau + n < T: 635 | G_v = G_v + γ**n * v[S[tau+n]] 636 | G_q = G_q + γ**n * q[S[tau+n], A[tau+n]] 637 | 638 | s_t = S[tau] 639 | a_t = A[tau] 640 | v[s_t] = v[s_t] + α * (G_v - v[s_t]) 641 | q[(s_t, a_t)] = q[(s_t, a_t)] + α * (G_q - q[(s_t, a_t)]) 642 | 643 | π.update_policy(q, s_t) 644 | 645 | if tau == T - 1: 646 | break 647 | 648 | if n_episode % sample_step == 0: 649 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 650 | 651 | return v, q, samples 652 | 653 | 654 | METHOD_MAP = { 655 | 'sarsa_on': _tdn_on, 656 | 'sarsa': _tdn_onoff, 657 | 'qlearning': _tdn_onoff, 658 | 'expected_sarsa': _tdn_onoff, 659 | 'dqlearning': _double_q 660 | } 661 | 662 | 663 | METHODS = METHOD_MAP.keys() 664 | 665 | 666 | def n_tree_backup(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 667 | state_0: Any=None, action_0: Any=None, gamma: float=1.0, n: int=1, 668 | alpha: float=0.05, n_episodes: int=MAX_ITER, policy: ModelFreePolicy=None, 669 | eps: float=None, optimize: bool=False, samples: int=1000, max_steps: int=MAX_STEPS 670 | ) -> Tuple[VQPi, Samples]: 671 | '''N-temporal differences algorithm. 672 | 673 | Temporal differences algorithm for estimating the value function of a 674 | policy, improve it and analyze it. 675 | 676 | Parameters 677 | ---------- 678 | states : Sequence[Any] 679 | actions : Sequence[Any] 680 | state_0 : Any, optional 681 | Initial state, by default None (random) 682 | action_0 : Any, optional 683 | Initial action, by default None (random) 684 | transition : Callable[[Any,Any],[[Any,float], bool]]] 685 | transition must be a callable function that takes as arguments the 686 | (state, action) and returns (new_state, reward), end. 687 | gamma : float, optional 688 | Discount factor, by default 0.9 689 | n : int, optional 690 | Number of steps to look ahead, by default 1 691 | alpha : float, optional 692 | Learning rate, by default 0.1 693 | n_episodes : int, optional 694 | Number of episodes to simulate, by default 1E4 695 | max_steps : int, optional 696 | Maximum number of steps per episode, by default 1E3 697 | policy : ModelFreePolicy, optional 698 | Policy to use, by default equal probability ModelFreePolicy 699 | eps : float, optional 700 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 701 | optimize : bool, optional 702 | Whether to optimize the policy or not, by default False 703 | samples : int, optional 704 | Number of samples to take, by default 1000 705 | 706 | Returns 707 | ------- 708 | vqpi : Tuple[VPi, QPi, Policy] 709 | Value function, action-value function, policy and samples if any. 
710 | samples : Tuple[int, List[Vpi], List[Qpi], List[ModelFreePolicy]] 711 | Samples taken during the simulation if any. The first element is the 712 | index of the iteration, the second is the value function, the third is 713 | the action-value function and the fourth is the TODO:. 714 | 715 | Raises 716 | ------ 717 | TransitionException: Ill defined transitions. 718 | ''' 719 | policy = _set_policy(policy, eps, actions, states) 720 | 721 | _typecheck_all(tabular_idxs=[states,actions], transition=transition, 722 | constants=[gamma, n, alpha, n_episodes, samples, max_steps], 723 | booleans=[optimize], policies=[policy]) 724 | 725 | sample_step = _get_sample_step(samples, n_episodes) 726 | 727 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 728 | 729 | v, q, samples = _n_tree_backup(model, state_0, action_0, n, alpha, int(n_episodes), 730 | max_steps, optimize, sample_step) 731 | 732 | return VQPi((v, q, policy)), samples 733 | 734 | 735 | def _n_tree_backup(MF, s_0, a_0, n, alpha, n_episodes, max_steps, 736 | optimize, sample_step): 737 | 738 | π, α, γ = MF.policy, alpha, MF.gamma 739 | 740 | v, q = MF.init_vq() 741 | 742 | samples = [] 743 | 744 | for n_episode in tqdm(range(n_episodes), desc=f'{n}-Tree Backup', unit='episodes'): 745 | s_0, a_0 = _set_s0_a0(MF, s_0, a_0) 746 | 747 | s = MF.states.get_index(s_0) 748 | a = MF.actions.get_index(a_0) 749 | T = int(max_steps) 750 | R, A, S, G = [], [a], [s], 0 751 | 752 | for t in range(T): 753 | if t < T: 754 | (s, r), end = MF.step_transition(s, a) 755 | R.append(r) 756 | S.append(s) 757 | if end: 758 | T = t + 1 759 | else: 760 | _, a = MF.random_sa() 761 | A.append(a) 762 | 763 | tau = t - n + 1 764 | if tau >= 0: 765 | if t + 1 >= T: 766 | G = R[-1] 767 | else: 768 | G = R[t] + γ*np.dot(π.pi[s[t]], q[s[t]]) 769 | 770 | for k in range(min(t, T-1), tau): 771 | G = R[k-1] + γ*np.dot(π.pi[s[k-1]], q[s[k-1]]) + \ 772 | γ*π.pi[s[k-1],A[k-1]]*(G-q[s[k-1], A[k-1]]) 773 | 774 | q[S[tau], A[tau]] = q[S[tau], A[tau]] + α[G-q[S[tau], A[tau]]] 775 | 776 | if optimize: 777 | π.update_policy(q, S[tau]) 778 | 779 | if tau == T - 1: 780 | break 781 | 782 | if n_episode % sample_step == 0: 783 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 784 | 785 | return v, q, samples -------------------------------------------------------------------------------- /rl/solvers/planning.py: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Tuple, 3 | Sequence, 4 | Any 5 | ) 6 | 7 | from tqdm import tqdm 8 | import numpy as np 9 | from numpy.linalg import norm as lnorm 10 | 11 | from rl.solvers.model_free import ( 12 | get_sample, 13 | _set_s0_a0, 14 | _set_policy, 15 | ) 16 | from rl.model_free import ModelFree, ModelFreePolicy 17 | from rl.utils import ( 18 | UCTree, 19 | UCTNode, 20 | _typecheck_all, 21 | _get_sample_step, 22 | _check_ranges, 23 | VQPi, 24 | Samples, 25 | Transition, 26 | Vpi, 27 | Qpi, 28 | PQueue, 29 | MAX_ITER, 30 | MAX_STEPS, 31 | TOL 32 | ) 33 | 34 | 35 | def dynaq(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 36 | state_0: Any=None, action_0: Any=None, gamma: float=1.0, kappa: float=0.01, 37 | n: int=1, plus: bool=False, alpha: float=0.05, n_episodes: int=MAX_ITER, 38 | policy: ModelFreePolicy=None, eps: float=None, samples: int=1000, 39 | max_steps: int=MAX_STEPS) -> Tuple[VQPi, Samples]: 40 | ''' 41 | TODO: docs 42 | ''' 43 | policy = _set_policy(policy, eps, actions, states) 44 | 45 | 
_typecheck_all(tabular_idxs=[states,actions], transition=transition, 46 | constants=[gamma, kappa, n, alpha, n_episodes, samples, max_steps], 47 | booleans=[plus], policies=[policy]) 48 | 49 | # check ranges 50 | 51 | sample_step = _get_sample_step(samples, n_episodes) 52 | 53 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 54 | v, q, samples = _dyna_q(model, state_0, action_0, n, alpha, kappa, plus, 55 | int(n_episodes), max_steps, sample_step) 56 | 57 | return VQPi((v, q, policy)), samples 58 | 59 | 60 | def _dyna_q(MF, s_0, a_0, n, alpha, kappa, plus, n_episodes, max_steps, 61 | sample_step): 62 | 63 | π, α, γ, κ = MF.policy, alpha, MF.gamma, kappa 64 | 65 | v, q = MF.init_vq() 66 | 67 | S, A = MF.states.N, MF.actions.N 68 | model_sas = np.zeros((S, A), dtype=int) 69 | model_sar = np.zeros((S, A), dtype=float) 70 | times_sa = np.zeros((S, A), dtype=int) 71 | 72 | samples = [] 73 | current_t = 0 74 | for n_episode in tqdm(range(n_episodes), desc='Dyna-Q', unit='episodes'): 75 | s_0, _ = _set_s0_a0(MF, s_0, None) 76 | 77 | s = MF.states.get_index(s_0) 78 | T = int(max_steps) 79 | 80 | for t in range(T): 81 | a = π(s) 82 | (s_, r), end = MF.step_transition(s, a) # real next state 83 | q[s, a] = q[s, a] + α*(r + γ*np.max(q[s_]) - q[s, a]) 84 | 85 | times_sa[s, a] = current_t 86 | 87 | # assuming deterministic environment 88 | model_sas[s, a] = s_ 89 | model_sar[s, a] = r 90 | 91 | current_t += 1 92 | 93 | for _ in range(n): 94 | rs, ra = MF.random_sa() 95 | s_m = model_sas[rs, ra] # model next state 96 | r_ = model_sar[rs, ra] 97 | R = r_ 98 | if plus: 99 | tau = current_t - times_sa[rs, ra] 100 | R = R + κ*np.sqrt(tau) 101 | q[rs, ra] = q[rs, ra] + α*(R + γ*np.max(q[s_m]) - q[rs, ra]) 102 | 103 | π.update_policy(q, s_) 104 | s = s_ # current state equal next state 105 | if end: 106 | break 107 | 108 | if n_episode % sample_step == 0: 109 | samples.append(get_sample(MF, v, q, π, n_episode, True)) 110 | 111 | return v, q, samples 112 | 113 | 114 | def priosweep(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 115 | state_0: Any=None, action_0: Any=None, gamma: float=1.0, theta: float=0.01, 116 | n: int=1, plus: bool=False, alpha: float=0.05, n_episodes: int=MAX_ITER, 117 | policy: ModelFreePolicy=None, eps: float=None, samples: int=1000, 118 | max_steps: int=MAX_STEPS) -> Tuple[VQPi, Samples]: 119 | ''' 120 | TODO: docs 121 | ''' 122 | policy = _set_policy(policy, eps, actions, states) 123 | 124 | _typecheck_all(tabular_idxs=[states, actions], transition=transition, 125 | constants=[gamma, theta, n, alpha, n_episodes, samples, max_steps], 126 | booleans=[plus], policies=[policy]) 127 | 128 | # check ranges 129 | 130 | sample_step = _get_sample_step(samples, n_episodes) 131 | 132 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 133 | v, q, samples = _priosweep(model, state_0, action_0, n, alpha, theta, 134 | int(n_episodes), max_steps, sample_step) 135 | 136 | return VQPi((v, q, policy)), samples 137 | 138 | 139 | def _priosweep(MF, s_0, a_0, n, alpha, theta, n_episodes, max_steps, 140 | sample_step): 141 | 142 | π, α, γ = MF.policy, alpha, MF.gamma 143 | v, q = MF.init_vq() 144 | 145 | P, Pq, θ = 0, PQueue([]), theta 146 | 147 | S, A = MF.states.N, MF.actions.N 148 | model_sas = np.zeros((S, A), dtype=int) 149 | model_sar = np.zeros((S, A), dtype=float) 150 | times_sa = np.zeros((S, A), dtype=int) 151 | 152 | samples, current_t = [], 0 153 | for n_episode in tqdm(range(n_episodes), desc='priosweep', 
unit='episodes'): 154 | s_0, _ = _set_s0_a0(MF, s_0, None) 155 | 156 | s = MF.states.get_index(s_0) 157 | T = int(max_steps) 158 | 159 | for t in range(T): 160 | a = π(s) 161 | (s_, r), end = MF.step_transition(s, a) # real next state 162 | times_sa[s, a] = current_t 163 | model_sas[s, a] = s_ 164 | model_sar[s, a] = r 165 | 166 | P = np.abs(r + γ*np.max(q[s_]) - q[s, a]) 167 | if P > θ: 168 | Pq.push((s, a), P) 169 | 170 | current_t += 1 171 | 172 | for _ in range(n): 173 | if Pq.empty(): 174 | break 175 | 176 | ps, pa = Pq.pop() 177 | s_m = model_sas[ps, pa] # model next state 178 | r_ = model_sar[ps, pa] 179 | R = r_ 180 | 181 | q[ps, pa] = q[ps, pa] + α*(R + γ*np.max(q[s_m]) - q[ps, pa]) 182 | 183 | # grab all the index where model_sas == s 184 | mmask = (model_sas == s) 185 | for ss, aa in zip(*np.where(mmask)): 186 | rr = model_sar[ss, aa] 187 | P = np.abs(rr + γ*np.max(q[s]) - q[ss, aa]) 188 | if P > θ: 189 | Pq.push((s, a), P) 190 | 191 | π.update_policy(q, s_) 192 | s = s_ # current state equal next state 193 | if end: 194 | break 195 | 196 | if n_episode % sample_step == 0: 197 | samples.append(get_sample(MF, v, q, π, n_episode, True)) 198 | 199 | return v, q, samples 200 | 201 | 202 | def t_sampling(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 203 | state_0: Any=None, action_0: Any=None, gamma: float=1.0, 204 | n_episodes: int=MAX_ITER, policy: ModelFreePolicy=None, eps: float=None, 205 | samples: int=1000, optimize: bool=False, max_steps: int=MAX_STEPS 206 | ) -> Tuple[VQPi, Samples]: 207 | ''' 208 | TODO: docs 209 | ''' 210 | policy = _set_policy(policy, eps, actions, states) 211 | 212 | _typecheck_all(tabular_idxs=[states,actions], transition=transition, 213 | constants=[gamma, n_episodes, samples, max_steps], 214 | booleans=[optimize], policies=[policy]) 215 | 216 | # TODO: check ranges 217 | 218 | sample_step = _get_sample_step(samples, n_episodes) 219 | 220 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 221 | v, q, samples = _t_sampling(model, state_0, action_0, int(n_episodes), 222 | optimize, max_steps, sample_step) 223 | 224 | return VQPi((v, q, policy)), samples 225 | 226 | 227 | def _t_sampling(MF, s_0, a_0, n_episodes, optimize, 228 | max_steps, sample_step): 229 | 230 | π, γ = MF.policy, MF.gamma 231 | v, q = MF.init_vq() 232 | 233 | S, A = MF.states.N, MF.actions.N 234 | n_sas = np.zeros((S, A, S), dtype=int) # p(s'|s,a) 235 | model_sar = np.zeros((S, A, S), dtype=float) # r(s,a,s') deterministic reward 236 | 237 | samples = [] 238 | for n_episode in tqdm(range(n_episodes), desc='Trajectory Sampling', unit='episodes'): 239 | s, a = _set_s0_a0(MF, s_0, a_0) 240 | a_ = MF.actions.get_index(a) 241 | s = MF.states.get_index(s) 242 | 243 | for _ in range(int(max_steps)): 244 | (s_, r), end = MF.step_transition(s, a_) # real next state 245 | 246 | n_sas[s, a, s_] += 1 247 | model_sar[s, a, s_] = r # assumes deterministic reward 248 | 249 | # p_sas is the probability of transitioning from s to s' 250 | p_sas = n_sas[s,a]/np.sum(n_sas[s, a]) 251 | next_s_mask = np.where(p_sas)[0] 252 | max_q = np.max(q[next_s_mask, :], axis=1) 253 | r_ns = model_sar[s, a, next_s_mask] 254 | p_ns = p_sas[next_s_mask] 255 | 256 | q[s, a] = np.dot(p_ns, r_ns + γ*max_q) 257 | 258 | π.update_policy(q, s) 259 | a_ = π(s_) 260 | s = s_ 261 | 262 | if end: 263 | break 264 | 265 | if n_episode % sample_step == 0: 266 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 267 | 268 | return v, q, samples 269 | 270 | 271 | def rtdp(): 
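    # TODO: real-time dynamic programming (RTDP, Sutton & Barto §8.7): planning
    # backups along trajectories experienced from the start state(s). Not
    # implemented yet.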
272 | raise NotImplementedError 273 | 274 | 275 | 276 | def _best_child(v, Cp): 277 | actions = np.array(list(v.children.keys())) 278 | qs = np.array([v.children[a].q for a in actions]) 279 | ns = np.array([v.children[a].n for a in actions]) 280 | ucb = qs/ns + Cp*np.sqrt(np.log(v.n)/ns) 281 | return v.children[actions[np.argmax(ucb)]] 282 | 283 | 284 | def _expand(v, transition, actions): 285 | a = np.random.choice(list(actions)) 286 | (s_, _), end = transition(v.state, a) 287 | v_prime = UCTNode(s_, a, 0, 1, v, end) 288 | v.children[a] = v_prime 289 | return v_prime 290 | 291 | 292 | def _tree_policy(tree, Cp, transition, action_map, eps): 293 | v = tree.root 294 | while not v.is_terminal: 295 | actions = action_map(v.state) 296 | took_actions = v.children.keys() 297 | unexplored = set(actions) - set(took_actions) 298 | if not took_actions: 299 | return _expand(v, transition, actions) 300 | if unexplored and np.random.rand() < eps: 301 | return _expand(v, transition, unexplored) 302 | v = _best_child(v, Cp) 303 | return v 304 | 305 | 306 | def _default_policy(v_leaf, transition, action_map, max_steps): 307 | step, r = 0, 0 308 | s = v_leaf.state 309 | 310 | if v_leaf.is_terminal: 311 | return r 312 | 313 | while step < max_steps: 314 | actions = action_map(s) 315 | a = np.random.choice(actions) 316 | (s, _r), end = transition(s, a) 317 | r += _r 318 | if end: 319 | return 1 320 | step += 1 321 | return -1 322 | 323 | 324 | def _backup(v_leaf, delta): 325 | v = v_leaf 326 | while v: 327 | v.n += 1 328 | v.q += delta 329 | v = v.parent 330 | 331 | 332 | def mcts(s0, Cp, budget, transition, action_map, max_steps, tree=None, 333 | eps=1, verbose=True): 334 | ''' 335 | Effectively implementing the UCT search algorithm 336 | ''' 337 | s = s0 338 | if not tree: 339 | tree = UCTree(s, Cp) 340 | for _ in tqdm(range(budget), desc='MCTS', disable=not verbose): 341 | v_leaf = _tree_policy(tree, Cp, transition, action_map, eps) 342 | delta = _default_policy(v_leaf, transition, action_map, max_steps) 343 | _backup(v_leaf, delta) 344 | 345 | v_best = _best_child(tree.root, 0) 346 | return v_best.action, tree 347 | -------------------------------------------------------------------------------- /rl/tiles.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tile Coding Software version 3.0beta 3 | by Rich Sutton 4 | based on a program created by Steph Schaeffer and others 5 | External documentation and recommendations on the use of this code is available in the 6 | reinforcement learning textbook by Sutton and Barto, and on the web. 7 | These need to be understood before this code is. 8 | 9 | This software is for Python 3 or more. 10 | 11 | This is an implementation of grid-style tile codings, based originally on 12 | the UNH CMAC code (see http://www.ece.unh.edu/robots/cmac.htm), but by now highly changed. 13 | Here we provide a function, "tiles", that maps floating and integer 14 | variables to a list of tiles, and a second function "tiles-wrap" that does the same while 15 | wrapping some floats to provided widths (the lower wrap value is always 0). 16 | 17 | The float variables will be gridded at unit intervals, so generalization 18 | will be by approximately 1 in each direction, and any scaling will have 19 | to be done externally before calling tiles. 20 | 21 | Num-tilings should be a power of 2, e.g., 16. To make the offsetting work properly, it should 22 | also be greater than or equal to four times the number of floats. 
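For example (illustrative), with an index hash table:

    iht = IHT(1024)
    tiles(iht, 8, [x/0.25, y/0.25])

returns a list of 8 tile indices in [0, 1024) for the point (x, y); the scaling
(here 1/0.25) is chosen by the caller and sets the width of generalization.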
23 | 24 | The first argument is either an index hash table of a given size (created by (make-iht size)), 25 | an integer "size" (range of the indices from 0), or nil (for testing, indicating that the tile 26 | coordinates are to be returned without being converted to indices). 27 | """ 28 | from math import floor, log 29 | from itertools import zip_longest 30 | 31 | basehash = hash 32 | 33 | class IHT: 34 | "Structure to handle collisions" 35 | def __init__(self, sizeval): 36 | self.size = sizeval 37 | self.overfullCount = 0 38 | self.dictionary = {} 39 | 40 | def __str__(self): 41 | "Prepares a string for printing whenever this object is printed" 42 | return "Collision table:" + \ 43 | " size:" + str(self.size) + \ 44 | " overfullCount:" + str(self.overfullCount) + \ 45 | " dictionary:" + str(len(self.dictionary)) + " items" 46 | 47 | def count(self): 48 | return len(self.dictionary) 49 | 50 | def fullp(self): 51 | return len(self.dictionary) >= self.size 52 | 53 | def getindex(self, obj, readonly=False): 54 | d = self.dictionary 55 | if obj in d: return d[obj] 56 | elif readonly: return None 57 | size = self.size 58 | count = self.count() 59 | if count >= size: 60 | if self.overfullCount==0: print('IHT full, starting to allow collisions') 61 | self.overfullCount += 1 62 | return basehash(obj) % self.size 63 | else: 64 | d[obj] = count 65 | return count 66 | 67 | def hashcoords(coordinates, m, readonly=False): 68 | if type(m)==IHT: return m.getindex(tuple(coordinates), readonly) 69 | if type(m)==int: return basehash(tuple(coordinates)) % m 70 | if m==None: return coordinates 71 | 72 | 73 | def tiles(ihtORsize, numtilings, floats, ints=[], readonly=False): 74 | """returns num-tilings tile indices corresponding to the floats and ints""" 75 | qfloats = [floor(f*numtilings) for f in floats] 76 | Tiles = [] 77 | for tiling in range(numtilings): 78 | tilingX2 = tiling*2 79 | coords = [tiling] 80 | b = tiling 81 | for q in qfloats: 82 | coords.append( (q + b) // numtilings ) 83 | b += tilingX2 84 | coords.extend(ints) 85 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 86 | return Tiles 87 | 88 | def tileswrap(ihtORsize, numtilings, floats, wrapwidths, ints=[], readonly=False): 89 | """returns num-tilings tile indices corresponding to the floats and ints, wrapping some floats""" 90 | qfloats = [floor(f*numtilings) for f in floats] 91 | Tiles = [] 92 | for tiling in range(numtilings): 93 | tilingX2 = tiling*2 94 | coords = [tiling] 95 | b = tiling 96 | for q, width in zip_longest(qfloats, wrapwidths): 97 | c = (q + b%numtilings) // numtilings 98 | coords.append(c%width if width else c) 99 | b += tilingX2 100 | coords.extend(ints) 101 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 102 | return Tiles 103 | -------------------------------------------------------------------------------- /rl/utils.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import warnings 3 | from typing import ( 4 | Any, 5 | Sequence, 6 | List, 7 | Tuple, 8 | Callable, 9 | NewType 10 | ) 11 | 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | 15 | plt.style.use("dark_background") 16 | 17 | MAX_STEPS = 1E3 18 | MAX_ITER = int(1E4) 19 | TOL = 5E-8 20 | MEAN_ITERS = int(1E4) 21 | W_INIT = 1E-3 22 | 23 | class Policy(ABC): 24 | def __init__(self): 25 | pass 26 | 27 | @abstractmethod 28 | def __call__(self, state: int = None) -> int: 29 | raise NotImplementedError 30 | 31 | @abstractmethod 32 | def update_policy(self, 
*args, **kwargs):
33 |         raise NotImplementedError
34 | 
35 | 
36 | class _TabularIndexer():
37 |     '''Simple proxy for tabular states & actions.'''
38 |     def __init__(self, seq: Sequence[Any]):
39 |         self.seq = seq
40 |         self.N = len(seq)
41 |         self.index = {v: i for i, v in enumerate(seq)}
42 |         self.revindex = {i: v for i, v in enumerate(seq)}
43 | 
44 |     def get_index(self, v) -> Any:
45 |         return self.index[v]
46 | 
47 |     def from_index(self, idx) -> Any:
48 |         return self.revindex[idx]
49 | 
50 |     def random(self, value=False):
51 |         rnd_idx = np.random.choice(self.N)
52 |         if value:
53 |             return self.seq[rnd_idx]
54 |         return rnd_idx
55 | 
56 | 
57 | class State(_TabularIndexer):
58 |     pass
59 | 
60 | 
61 | class Action(_TabularIndexer):
62 |     pass
63 | 
64 | 
65 | class StateAction(_TabularIndexer):
66 |     pass
67 | 
68 | 
69 | class _TabularValues:
70 |     def __init__(self, values: np.ndarray, idx: _TabularIndexer):
71 |         self.v = values
72 |         self.idx = idx
73 |         self.idx_val = {k: v for k, v in zip(idx.index.keys(), values)}
74 | 
75 |     def values(self):
76 |         return self.v
77 | 
78 | 
79 | class Vpi(_TabularValues):
80 |     def __str__(self):
81 |         return f'Vpi({self.v[:5]}...)'
82 | 
83 | 
84 | class Qpi(_TabularValues):
85 |     def __str__(self):
86 |         return f'Qpi({self.v[:5]}...)'
87 | 
88 | 
89 | VQPi = NewType('VQPi', Tuple[Vpi, Qpi, Policy])
90 | Samples = NewType('Samples', Tuple[int, List[Vpi], List[Qpi], List[Policy]])
91 | Transition = Callable[[Any, Any], Tuple[Tuple[Any, float], bool]]
92 | EpisodeStep = NewType('EpisodeStep', Tuple[int, int, float])
93 | 
94 | class TransitionException(Exception):
95 |     pass
96 | 
97 | class PQueue:
98 |     '''Priority queue that pops the item with the smallest priority value first.'''
99 |     def __init__(self, items: List[Tuple[float, Any]]):
100 |         self.items = items
101 |         self._sort()
102 | 
103 |     def _sort(self):
104 |         self.items.sort(key=lambda x: x[0])
105 | 
106 |     def push(self, item, priority):
107 |         self.items.append((priority, item))
108 |         self._sort()
109 | 
110 |     def pop(self):
111 |         return self.items.pop(0)[1]
112 | 
113 |     def empty(self):
114 |         return len(self.items) == 0
115 | 
116 | 
117 | class RewardGenerator:
118 |     DISTRIBUTION = {
119 |         'bernoulli': np.random.binomial,
120 |         'gaussian': np.random.normal,
121 |         'uniform': np.random.uniform,
122 |         'exponential': np.random.exponential,
123 |         'poisson': np.random.poisson,
124 |         'pareto': np.random.pareto,
125 |         'triangular': np.random.triangular,
126 |     }
127 | 
128 |     @classmethod
129 |     def generate(cls, distribution='gaussian', *args, **kwargs) -> float:
130 |         generator = cls.DISTRIBUTION.get(distribution)
131 |         if not generator:
132 |             raise ValueError(f'Invalid distribution: {distribution}')
133 |         return generator(*args, **kwargs)
134 | 
135 | 
136 | class UCTNode:
137 |     def __init__(self, state, action, q, n, parent=None, is_terminal=False):
138 |         self.state = state
139 |         self.action = action
140 |         self.q = q
141 |         self.n = n
142 |         self.parent = parent
143 |         self.children = {}
144 |         self.is_terminal = is_terminal
145 | 
146 |     def add_child(self, child):
147 |         self.children[child.action] = child
148 |         return child
149 | 
150 | 
151 | class UCTree:
152 |     def __init__(self, root, Cp=1.0, max_steps=MAX_STEPS, nodes=None):
153 |         if not isinstance(root, UCTNode):
154 |             self.root = UCTNode(root, None, 0, 1, None)
155 |         else:
156 |             self.root = root
157 |         self.Cp = Cp
158 |         self.max_steps = max_steps
159 |         self.nodes = {} if not nodes else nodes
160 | 
161 |     def max_depth(self):
162 |         stack = [(self.root, 0)]
163 |         max_depth = 0
164 |         while stack:
165 |             node, depth = stack.pop()
166 |             max_depth = max(depth,
max_depth) 167 | for child in node.children.values(): 168 | stack.append((child, depth+1)) 169 | return max_depth 170 | 171 | def plot(self): 172 | max_depth = self.max_depth() 173 | width = 4*max_depth 174 | height = max_depth 175 | stack = [(self.root, 0, 0, width)] 176 | treenodes = [] 177 | lines = [] 178 | while stack: 179 | node, depth, x, step = stack.pop() 180 | node_pos = (x + step/2, height-depth) 181 | treenodes.append(node_pos) 182 | if node.children: 183 | n_childs = len(node.children) 184 | step = step/n_childs 185 | for i, child in enumerate(node.children.values()): 186 | stack.append((child, depth+1, x+i*step, step)) 187 | lines.append((node_pos, (step/2 + x+i*step, height-depth-1))) 188 | 189 | fig = plt.figure(figsize=(10, 10)) 190 | ax = fig.add_subplot(111) 191 | ax.set_xticks([]) 192 | ax.set_yticks([]) 193 | for node in treenodes: 194 | ax.scatter(node[0], node[1], color='white', s=1) 195 | for line in lines: 196 | ax.plot([line[0][0], line[1][0]], 197 | [line[0][1], line[1][1]], 198 | color='white', linewidth=0.5) 199 | plt.show() 200 | 201 | 202 | def _typecheck_tabular_idxs(*args): 203 | for arg in args: 204 | if not isinstance(arg, (Sequence, np.ndarray)): 205 | raise TypeError( 206 | f"Tabular Indexes must be Sequence, not {type(arg)}") 207 | 208 | 209 | def _typecheck_transition(transition): 210 | if not isinstance(transition, Callable): 211 | raise TypeError( 212 | f"transition must be a Callable, not {type(transition)}") 213 | 214 | if transition.__code__.co_argcount != 2: 215 | raise TypeError( 216 | f"transition must have two positional arguments," 217 | f" not {transition.__code__.co_argcount}") 218 | 219 | 220 | def _typecheck_constants(*args): 221 | for arg in args: 222 | if not isinstance(arg, (float, int)): 223 | raise TypeError( 224 | f"Constants must be float or int, not {type(arg)}") 225 | 226 | 227 | def _typecheck_booleans(*args): 228 | for arg in args: 229 | if not isinstance(arg, bool): 230 | raise TypeError( 231 | f"Booleans must be bool, not {type(arg)}") 232 | 233 | def _typecheck_policies(*args): 234 | for arg in args: 235 | if not isinstance(arg, Policy): 236 | raise TypeError( 237 | f"Policies must be Policy, not {type(arg)}") 238 | 239 | 240 | def _typecheck_all(tabular_idxs=None, transition=None, constants=None, 241 | booleans=None, policies=None): 242 | if tabular_idxs: 243 | _typecheck_tabular_idxs(*tabular_idxs) 244 | if transition: 245 | _typecheck_transition(transition) 246 | if constants: 247 | _typecheck_constants(*constants) 248 | if booleans: 249 | _typecheck_booleans(*booleans) 250 | if policies: 251 | _typecheck_policies(*policies) 252 | 253 | 254 | def _get_sample_step(samples, n_episodes): 255 | if samples > n_episodes: 256 | samples = n_episodes 257 | if samples > 1E3: 258 | samples = int(1E3) 259 | sample_step = int(n_episodes / samples) 260 | return sample_step 261 | 262 | 263 | def _check_ranges(values, ranges): 264 | for v, r in zip(values, ranges): 265 | if v < r[0] or v > r[1]: 266 | raise ValueError(f"{v} is out of range {r}") 267 | 268 | 269 | def auto_cardinal(values, n, safe=True): 270 | if (n+1)**len(values) > 2.5E6: 271 | if safe: 272 | raise ValueError("Too many combinations, may cause memory error," 273 | "set safe=False to avoid raising this error") 274 | else: 275 | warnings.warn("Too many combinations, may cause memory error") 276 | prod = np.array(np.meshgrid(*[values for _ in range(n)])) 277 | return prod.T.reshape(-1, n) 278 | 279 | 280 | class BasisException(Exception): 281 | pass 282 | 283 | 284 | def 
get_basis(self, basis, cij) -> Callable[[np.ndarray], np.ndarray]:
285 |     '''Get a basis feature constructor for a linear approximator, using a
286 |     polynomial or Fourier basis. (The first positional argument is unused.)
287 | 
288 |     Parameters
289 |     ----------
290 |     basis : str
291 |         Basis to use, either 'poly' or 'fourier'
292 |     cij : np.ndarray
293 |         Coefficient vectors for the basis functions
294 | 
295 |     Returns
296 |     -------
297 |     basis : Callable[[np.ndarray], np.ndarray]
298 |         Feature constructor. It only works on states represented as
299 |         sequences or numpy arrays; any other state type raises
300 |         BasisException.
301 |     '''
302 |     if basis == 'poly':
303 |         def _basis(s):
304 |             xs = [np.prod(s**cj) for cj in cij]
305 |             return np.array(xs)
306 |     elif basis == 'fourier':
307 |         def _basis(s):
308 |             xs = [np.cos(np.pi*np.dot(s, cj)) for cj in cij]
309 |             return np.array(xs)
310 |     else:
311 |         raise BasisException(f"Unknown basis '{basis}', use 'poly' or 'fourier'")
312 | 
313 |     def basis_f(s):
314 |         try:
315 |             return _basis(s)
316 |         except Exception:
317 |             raise BasisException('State must be a sequence or numpy array')
318 |     return basis_f
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | setup(name='rl', packages=find_packages())
--------------------------------------------------------------------------------
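Usage sketch for the UCT search in rl/solvers/planning.py (the import path is inferred from the tree layout, and the toy corridor environment below is hypothetical, not one of the repo's examples). The transition callable must return ((next_state, reward), done) and the action map must return the legal actions for a state, matching the Transition type alias in rl/utils.py.

import numpy as np
from rl.solvers.planning import mcts   # assumed import path

# Hypothetical 5-cell corridor: states 0..4, episode ends at cell 4.
def corridor_transition(s, a):
    s_next = int(np.clip(s + a, 0, 4))
    done = s_next == 4
    return (s_next, 1.0 if done else 0.0), done

def corridor_actions(s):
    return [-1, 1]          # move left or move right

best_action, tree = mcts(0, Cp=1.0, budget=500,
                         transition=corridor_transition,
                         action_map=corridor_actions,
                         max_steps=50, verbose=False)

best_action is the root child with the highest average backed-up value after the simulation budget; the returned tree can be passed back in through the tree argument to keep searching, or inspected with tree.plot().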
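The tile coder in rl/tiles.py is used as in Sutton's tiles3 documentation: build an index hash table, scale each float externally so a unit distance matches the desired generalization width, and collect one active tile per tiling. A minimal sketch (the mountain-car ranges below are illustrative, not taken from examples/mountain_car.py):

from rl.tiles import IHT, tiles

iht = IHT(4096)                     # all returned indices fall in [0, 4096)
position, velocity = -0.5, 0.02     # e.g. mountain-car state variables
# 8 tilings (a power of two, and >= 4 * number of floats, per the module docstring);
# each variable is scaled to roughly [0, 8] before the call.
active = tiles(iht, 8, [8 * (position + 1.2) / 1.7,
                        8 * (velocity + 0.07) / 0.14])
# `active` is a list of 8 integer indices, one active tile per tiling; an extra
# ints argument (e.g. [action]) can be appended to hash the action in as well.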
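A sketch of building features with get_basis from rl/utils.py, paired with auto_cardinal to enumerate coefficient vectors; this pairing is my assumption about intended use, not taken from rl/approximators.py. The first parameter of get_basis is unused, so None is passed for it here.

import numpy as np
from rl.utils import auto_cardinal, get_basis

# All coefficient vectors (c1, c2) with ci in {0, 1, 2}: an order-2 basis
# over a 2-dimensional state.
cij = auto_cardinal([0, 1, 2], 2)          # shape (9, 2)

fourier = get_basis(None, 'fourier', cij)  # first argument is unused
x = fourier(np.array([0.3, 0.7]))          # 9 features: cos(pi * s . c) for each c
poly = get_basis(None, 'poly', cij)
y = poly(np.array([0.3, 0.7]))             # 9 features: prod(s ** c) for each c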