├── .gitignore ├── LICENSE ├── README.md ├── assets └── images │ ├── 2023-01-24-11-04-11.png │ ├── 2023-01-24-11-04-28.png │ ├── acc_vs_steps_alpha.png │ ├── accuracy_vs_steps_eps.png │ ├── blackjack_rms_vs_ep.png │ ├── blackjack_v.png │ ├── expected_r_vs_steps.png │ ├── expected_r_vs_steps_alpha.png │ ├── infinite_variance.png │ ├── maximization_bias.png │ ├── mc_vs_td.png │ ├── mountain_car_n1_vs_n8.png │ ├── n_effect.png │ ├── ordinary_vs_weighted.png │ ├── q_vs_sarsa.png │ ├── single_state.png │ ├── steps_per_episode_vs_episode.png │ ├── ucb_expected_reward_vs_steps.png │ ├── ucb_steps_vs_acc.png │ └── uct.png ├── docs ├── MDP.md └── MODELFREE.md ├── examples ├── blackjack.py ├── dyna_maze.py ├── gridworld.py ├── mcts.py ├── mountain_car.py ├── random_walk.py ├── short_corridor.py ├── single_state.py ├── state_aggregation.py └── windy_gridworld.py ├── requirements.txt ├── rl ├── __init__.py ├── approximators.py ├── armed_bandits.py ├── mdp.py ├── model_free.py ├── solvers │ ├── __init__.py │ ├── approx.py │ ├── model_based.py │ ├── model_free.py │ └── planning.py ├── tiles.py └── utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 ivan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/ivanbelenky/RL/blob/master/LICENSE) 4 | [![Python 3.8](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/release/python-360/) 5 | 6 | ## Installation 7 | 8 | ### setup.py 9 | ```sh 10 | $ python setup.py install 11 | ``` 12 | 13 | # Overview 14 | 15 | This repository contains code that implements algorithms and models from Sutton's book on reinforcement learning. The book, titled "Reinforcement Learning: An Introduction," is a classic text on the subject and provides a comprehensive introduction to the field. 16 | 17 | The code in this repository is organized into several modules, each of which covers different topics. 18 | 19 | 20 | # Methods 21 | 22 | - [x] Multi Armed Bandits 23 | - [x] Epsilon Greedy 24 | - [x] Optimistic Initial Values 25 | - [x] Gradient 26 | - [x] α (non stationary) 27 | - [x] Model Based 28 | - [x] Policy Evaluation 29 | - [x] Policy Iteration 30 | - [x] Value Iteration 31 | - [x] Monte Carlo estimation and control 32 | - [x] First-visit α-MC 33 | - [x] Every-visit α-MC 34 | - [x] MC with Exploring Starts 35 | - [x] Off-policy MC, ordinary and weighted importance sampling 36 | - [x] Temporal Difference 37 | - [x] TD(n) estimation 38 | - [x] n-step SARSA 39 | - [x] n-step Q-learning 40 | - [x] n-step Expected SARSA 41 | - [x] double Q learning 42 | - [x] n-step Tree Backup 43 | - [x] Planning 44 | - [x] Dyna-Q/Dyna-Q+ 45 | - [x] Prioritized Sweeping 46 | - [x] Trajectory Sampling 47 | - [x] MCTS 48 | - [ ] On-policy Prediction 49 | - [x] Gradient MC 50 | - [x] $n$-step semi-gradient TD 51 | - [ ] ANN 52 | - [ ] Least-Squares TD 53 | - [ ] Kernel-based 54 | - [x] On-policy Control 55 | - [x] Episodic semi-gradient 56 | - [x] Semi-gradient n-step Sarsa 57 | - [x] Differential Semi-gradient n-step Sarsa 58 | - [ ] Eligibility Traces 59 | - [x] TD($\lambda$) 60 | - [ ] True Online 61 | - [x] Sarsa($\lambda$) 62 | - [ ] True Online Sarsa($\lambda$) 63 | - [ ] Policy Gradient 64 | - [x] REINFORCE: Monte Carlo Policy Gradient w/wo Baseline 65 | - [ ] Actor-Critic (episodic) w/wo eligibility traces 66 | - [ ] Actor-Critic (continuing) with eligibility traces 67 |
68 | 69 | All model free solvers will work just by defining `states`, `actions`, and a `transition` function. Transitions are defined as a function that takes a state and an action and returns a tuple containing the next state and the reward, plus a boolean indicating whether the episode has terminated. 70 | 71 | ```python 72 | states: Sequence[Any] 73 | actions: Sequence[Any] 74 | transition: Callable[[Any, Any], Tuple[Tuple[Any, float], bool]] 75 | ``` 76 | 77 | # Examples 78 | 79 | **Single State Infinite Variance Example 5.5** 80 | 81 | ![](https://github.com/ivanbelenky/RL/blob/master/assets/images/single_state.png) 82 | 83 | 84 | ```python 85 | import numpy as np 86 | from rl import off_policy_mc, ModelFreePolicy 87 | states = [0] 88 | actions = ['left', 'right'] 89 | 90 | def single_state_transition(state, action): 91 | if action == 'right': 92 | return (state, 0), True 93 | if action == 'left': 94 | threshold = np.random.random() 95 | if threshold > 0.9: 96 | return (state, 1), True 97 | else: 98 | return (state, 0), False 99 | 100 | b = ModelFreePolicy(actions, states) # by default equiprobable 101 | pi = ModelFreePolicy(actions, states) 102 | pi.pi[0] = np.array([1, 0]) # target policy always goes left 103 | 104 | # calculate state value functions with ordinary and weighted importance sampling 105 | vqpi_ord, samples_ord = off_policy_mc(states, actions, single_state_transition, 106 | policy=pi, b=b, ordinary=True, first_visit=True, gamma=1., n_episodes=1E4) 107 | 108 | vqpi_w, samples_w = off_policy_mc(states, actions, single_state_transition, 109 | policy=pi, b=b, ordinary=False, first_visit=True, gamma=1., n_episodes=1E4) 110 | ``` 111 | 112 | ![](https://github.com/ivanbelenky/RL/blob/master/assets/images/ordinary_vs_weighted.png) 113 | 114 |
115 | 116 | **Monte Carlo Tree Search maze solving plot** 117 | 118 | ```python 119 | s = START_XY 120 | budget = 500 121 | cp = 1/np.sqrt(2) 122 | end = False 123 | max_steps = 50 124 | while not end: 125 | action, tree = mcts(s, cp, budget, obstacle_maze, action_map, max_steps, eps=1) 126 | (s, _), end = obstacle_maze(s, action) 127 | 128 | tree.plot() 129 | ``` 130 | 131 | ![](https://github.com/ivanbelenky/RL/blob/master/assets/images/uct.png) 132 | 133 |
134 | 135 | # Contributing 136 | 137 | While the code in this package provides a basic implementation of the algorithms from the book, it is not necessarily the most efficient or well-written. If you have suggestions for improving the code, please feel free to open an issue. 138 | 139 | Overall, this package provides a valuable resource for anyone interested in learning about reinforcement learning and implementing algorithms from scratch. By no means prod ready. 140 | -------------------------------------------------------------------------------- /assets/images/2023-01-24-11-04-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/2023-01-24-11-04-11.png -------------------------------------------------------------------------------- /assets/images/2023-01-24-11-04-28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/2023-01-24-11-04-28.png -------------------------------------------------------------------------------- /assets/images/acc_vs_steps_alpha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/acc_vs_steps_alpha.png -------------------------------------------------------------------------------- /assets/images/accuracy_vs_steps_eps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/accuracy_vs_steps_eps.png -------------------------------------------------------------------------------- /assets/images/blackjack_rms_vs_ep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/blackjack_rms_vs_ep.png -------------------------------------------------------------------------------- /assets/images/blackjack_v.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/blackjack_v.png -------------------------------------------------------------------------------- /assets/images/expected_r_vs_steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/expected_r_vs_steps.png -------------------------------------------------------------------------------- /assets/images/expected_r_vs_steps_alpha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/expected_r_vs_steps_alpha.png -------------------------------------------------------------------------------- /assets/images/infinite_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/infinite_variance.png -------------------------------------------------------------------------------- 
/assets/images/maximization_bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/maximization_bias.png -------------------------------------------------------------------------------- /assets/images/mc_vs_td.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/mc_vs_td.png -------------------------------------------------------------------------------- /assets/images/mountain_car_n1_vs_n8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/mountain_car_n1_vs_n8.png -------------------------------------------------------------------------------- /assets/images/n_effect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/n_effect.png -------------------------------------------------------------------------------- /assets/images/ordinary_vs_weighted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/ordinary_vs_weighted.png -------------------------------------------------------------------------------- /assets/images/q_vs_sarsa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/q_vs_sarsa.png -------------------------------------------------------------------------------- /assets/images/single_state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/single_state.png -------------------------------------------------------------------------------- /assets/images/steps_per_episode_vs_episode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/steps_per_episode_vs_episode.png -------------------------------------------------------------------------------- /assets/images/ucb_expected_reward_vs_steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/ucb_expected_reward_vs_steps.png -------------------------------------------------------------------------------- /assets/images/ucb_steps_vs_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/ucb_steps_vs_acc.png -------------------------------------------------------------------------------- /assets/images/uct.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/assets/images/uct.png -------------------------------------------------------------------------------- 
/docs/MDP.md: -------------------------------------------------------------------------------- 1 | # Markov Decision Process (MDP) Framework 2 | 3 | This code provides a framework for defining and solving Markov Decision Processes (MDPs) in Python. It includes classes for defining MDPs, policies, and rewards, as well as functions for solving MDPs using various algorithms. 4 | 5 | ## Classes 6 | ### `MarkovReward` 7 | 8 | The `MarkovReward` class is an abstract base class for generating rewards in an MDP. It defines the `generate()` and `r_sas()` methods, which must be implemented by subclasses. 9 | ### `TabularReward` 10 | 11 | The `TabularReward` class is a concrete implementation of `MarkovReward` that uses a reward table to generate rewards. It has a constructor that takes a reward table `r_sa` as an input and stores it internally. The `generate()` method returns the reward for a given state and action, and the `r_sas()` method returns the mean reward for the next state. It may seem like overkill to create a class just to hold a table, but the idea is to be able to define arbitrary reward-generator functions for each state-action pair, i.e. to deal with independent $p(r,s'|s,a)$ distributions, even continuous ones. 12 | 13 | ### `MarkovPolicy` 14 | 15 | The `MarkovPolicy` class extends the `Policy` abstract base class and defines a policy for an `MDP`. It has a constructor that takes a policy table `pi_sa` as an input and stores it internally. The `update_policy()` method updates the policy using the given value function `q_pi`, and the `π()` method returns the policy for a given state. 16 | 17 | 18 | ### `MDP` 19 | 20 | The `MDP` class represents an MDP and provides methods for solving it. It has a constructor that takes a state transition matrix `p_s`, a list of states, a list of actions, a discount factor `gamma`, and optional `policy` and `reward_gen` objects. The `value_function()` and `optimal_policy()` methods can be used to compute the value function and optimal policy for the MDP using various solvers. 21 | 22 | ### `Solvers` 23 | 24 | The code includes a number of solver functions for computing the value function and optimal policy of an MDP, including `vq_pi_iter_naive`, `policy_iteration`, and `value_iteration`. These solvers can be used with the `MDP` class's `value_function()` and `optimal_policy()` methods to solve an MDP. 25 | 26 |
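To make the classes above concrete, here is a minimal, hedged usage sketch in the spirit of `examples/gridworld.py`. Only the constructor shape and the `vq_pi()` / `optimize_policy()` calls mirror that example; the tiny two-state chain, its rewards, and the `'policy_iteration'` method string are illustrative assumptions, not documented API.

```python
import numpy as np

from rl.mdp import MDP, TabularReward

# Hypothetical two-state chain: p_s[s, a, s'] holds transition probabilities.
p_s = np.zeros((2, 2, 2))
p_s[:, 0, 0] = 1.0   # action 0 always leads to state 0
p_s[:, 1, 1] = 1.0   # action 1 always leads to state 1

# r_sa[s, a]: reward for taking action a in state s (made up for illustration).
r_sa = np.array([[0.0, 1.0],
                 [0.0, 0.0]])

mdp = MDP(p_s, np.arange(2), np.arange(2), gamma=0.9,
          reward_gen=TabularReward(r_sa))

v, q = mdp.vq_pi()                              # evaluate the default equiprobable policy
mdp.optimize_policy(method='policy_iteration')  # assumed method string, matching the solvers listed above
v_opt, q_opt = mdp.vq_pi()
```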
27 | 28 | # **Dynamic Programming (DP) the cool kid in town** 29 | 30 | DP is the cool kid in town, since everybody is trying to copy him in some way or other. This does not mean that he is the coolest. 31 | 32 | DP is a collection of algorithms that can be used to compute optimal policies 33 | given a perfect model of the environment as an MDP. It is of limited practical use given 34 | its great computational expense, but all later methods can be thought of as trying to 35 | achieve the same effect. 36 | 37 | The MDP is assumed finite. Even for continuous examples the approach taken is always 38 | to quantize them. 39 | 40 | Optimality equations are operators. The system is guaranteed to have a 41 | solution and to converge if gamma < 1 or the runs are episodic. If the 42 | task is completely known then this is just a system of linear equations. 43 | 44 | The Bellman equation itself is an operator/mapping whose fixed point is the value function. It is usually called the **Bellman Expectation Operator** or the **Bellman Policy Operator**. It can be proven to converge under reasonable assumptions, like $\gamma < 1$. This method is known as **iterative policy evaluation**. 45 | 46 |
47 | 48 | ## **Iterative Evaluation** 49 | 50 | The iterative solution for the policy's expected value function can be written as 51 | 52 | $$ \color{orange} 53 | v_{k+1}(s) = { \sum_{a \in A} \pi(a|s) \sum_{s', r} p(s',r|s,a)[ r + \gamma v_k(s')] = \operatorname{B_{\pi}}[v_{k}(s)] } 54 | $$ 55 | 56 | where $\operatorname{B_{\pi}}$ is the **Bellman Expectation Operator**, which arises naturally from considering the Bellman equation as an operator that acts on the value function. It is easy to see that the actual value function is a fixed point of this operator. 57 | 58 | $$\color{orange} 59 | \operatorname{B_{\pi}}[v_{\pi}] = v_{\pi} 60 | $$ 61 | 62 | 63 | It is easy to show that $\operatorname{B_{\pi}}$ is a contraction mapping under the $L_\infty$ norm 64 | 65 | $$\color{orange} 66 | \begin{aligned} 67 | \left|\left|\operatorname{B_{\pi}}[v] - \operatorname{B_{\pi}}[u]\right|\right|_\infty &= \\ \\ 68 | &= \gamma \left|\left| \sum_{a \in A} \pi(a|s) \sum_{s', r} p(s',r|s,a)[v(s') - u(s')]\right|\right|_\infty \\ \\ 69 | &\leq \gamma ||v - u||_\infty 70 | \end{aligned} 71 | $$ 72 | 73 | Given that the value function is the unique fixed point, we can show that 74 | 75 | $$\color{orange} 76 | \lim_{k\rightarrow \infty} \operatorname{B_{\pi}}^k[v_0] = v_{\pi} 77 | $$ 78 | 79 | given the fact that 80 | 81 | $$\color{orange} 82 | \left|\left|v_{k} - v_{\pi} \right|\right| = \left|\left| \operatorname{B_{\pi}}[v_{k-1}] - \operatorname{B_{\pi}}[v_{\pi}] \right|\right| \leq \gamma \left|\left| v_{k-1} - v_{\pi} \right|\right| \leq \cdots \leq \gamma^k \left|\left| v_{0} - v_{\pi} \right|\right| 83 | $$ 84 | 85 |
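As a complement to the derivation above, here is a minimal NumPy sketch of iterative policy evaluation, written against plain arrays rather than this repo's `MDP` class; the array shapes and the function name are my own choices.

```python
import numpy as np

def iterative_policy_evaluation(p, r, pi, gamma=0.9, tol=1e-8):
    """Repeatedly apply the Bellman expectation operator B_pi until a fixed point.

    p:  (S, A, S') array with p(s'|s,a)
    r:  (S, A) array with expected rewards r(s,a)
    pi: (S, A) array with action probabilities pi(a|s)
    """
    v = np.zeros(p.shape[0])
    while True:
        # B_pi[v](s) = sum_a pi(a|s) [ r(s,a) + gamma * sum_s' p(s'|s,a) v(s') ]
        v_new = np.einsum('sa,sa->s', pi, r) + gamma * np.einsum('sa,saz,z->s', pi, p, v)
        if np.max(np.abs(v_new - v)) < tol:
            return v_new
        v = v_new
```

Because $\operatorname{B_{\pi}}$ is a $\gamma$-contraction, the loop is guaranteed to terminate for $\gamma < 1$.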
86 | 87 | ## **Policy Improvement** 88 | 89 | As the title suggests, dynamic programming also encompasses methods to solve the optimality problem, that is, to find the best policy there is for a given MDP. In the same fashion we can define Optimality Equations for the value function. It can easily be proven by contradiction that the following is true 90 | 91 | $$\color{orange} 92 | v_{*}(s) = \max_{a \in A} q_{\pi_{*}}(s,a) = \max_{a \in A} \sum_{s', r} p(s',r|s,a)[ r + \gamma v_{*}(s')] 93 | $$ 94 | 95 | 96 | A mouthful of a contradiction. If the above were not true, it would be possible to: 97 | - define a $\pi'(s)$ that modifies the policy for all states with the above rule 98 | - the new policy chooses the action that maximizes the value function for every $s$. 99 | - calculate the value function with this new policy 100 | - when we encounter each state again the new policy is going to kick in. Since it always gives more reward, and since the value function is composed of discounted rewards, we now have a policy with a higher value function. Hence a contradiction, since $v_{*}$ was already optimal. 101 | 102 | If that was still a mouthful, see that 103 | 104 | $$\color{orange} 105 | \begin{aligned} 106 | v_{\pi}(s) &\leq q_{\pi}(s, \pi'(s)) \\ 107 | &= \mathbb{E_{\pi'}}[r+\gamma v_{\pi}(s')]\\ 108 | &\leq \mathbb{E_{\pi'}}[r+\gamma q_{\pi}(s', \pi'(s'))]\\ 109 | &= \mathbb{E_{\pi'}}[r+\gamma r' + \gamma^2 v_{\pi}(s'')]\\ 110 | & \ \ \vdots \\ 111 | &\leq \mathbb{E_{\pi'}}\left[\sum_k r_k \gamma^k \right]\\ 112 | &= v_{\pi'}(s) 113 | \end{aligned} 114 | $$ 115 | 116 | ### **Policy Iteration** 117 | 118 | It is precisely the policy improvement theorem that guarantees that policy iteration will converge to the optimal policy after a number of iterations. 119 | 120 | As Sutton illustrates, the Policy Iteration algorithm consists of the following 121 | 122 | $$\color{orange} 123 | \pi_0 \overset{\mathbb{E}}{\longrightarrow} v_{\pi_0} \overset{\mathbb{I}}{\longrightarrow} \pi_1 \overset{\mathbb{E}}{\longrightarrow} \cdots \overset{\mathbb{I}}{\longrightarrow} \pi_{*} \overset{\mathbb{E}}{\longrightarrow} v_{\pi_{*}} 124 | $$ 125 | 126 | This particular solution is quite costly, since we have to perform a full evaluation step every single time the policy changes. But there is good news, namely Value Iteration. 127 | 128 | ### **Value Iteration** 129 | 130 | $$\color{orange} 131 | v_{\pi}(s)=\sum_{a \in A} \pi(a|s)q_{\pi}(s,a) 132 | $$ 133 | 134 | We then define the **Bellman Optimality Operator** as 135 | 136 | $$\color{orange} 137 | \operatorname{B_{*}}[v(s)] := \max_a \sum_{s', r} p(s',r|s,a)[ r + \gamma v(s')] 138 | $$ 139 | 140 | and we can show that it is a contraction mapping under the $L_\infty$ norm once again, with the help of the following property 141 | 142 | $$\color{orange} 143 | |\max_a f(a) - \max_a g(a) | \leq \max_a |f(a) - g(a)| 144 | $$ 145 | 146 | then 147 | 148 | $$\color{orange} 149 | \begin{aligned} 150 | \left|\left|\operatorname{B_{*}}[v] - \operatorname{B_{*}}[u]\right|\right|_\infty &= \\ \\ 151 | &= \gamma \left|\left| \max_a \sum_{s', r} p(s',r|s,a)[v(s') - u(s')]\right|\right|_\infty \\ \\ 152 | &\leq \gamma ||v - u||_\infty 153 | \end{aligned} 154 | $$ 155 | 156 | Once again the optimal value function is a fixed point of the Bellman Optimality Operator, i.e.
157 | 158 | $$\color{orange} 159 | v_{*} = \operatorname{B_{*}}[v_{*}] 160 | $$ 161 | 162 | implying that an iterative approach can be built such that 163 | 164 | $$\color{orange} 165 | v_{k+1} = \operatorname{B_{*}}[v_{k}] 166 | $$ 167 | 168 | Since we are guaranteed convergence, we can basically apply policy iteration but truncate the iterative policy evaluation step to a single sweep. 169 | 170 | 171 | ### **Drawbacks of DP** 172 | 173 | It is evident that for problems with massive state-action spaces, even with high-powered compute, DP is not a cost-effective solution. This is because the time complexity of DP is polynomial in the size of the state and action space (assuming the action space stays constant; if it also grows, it is naturally even worse). 174 | 175 | Asynchronous DP is a solution to this problem: as the name suggests, updates are not synchronized between improvement steps. As Sutton states _"Of course, avoiding sweeps does not necessarily mean that we can get away with less computation. It just means that an algorithm does not need to get locked into any hopelessly long sweep before it can make progress improving a policy"_. 176 | 177 | A few words on Generalized Policy Iteration. 178 | 179 | ![](/assets/images/2023-01-24-11-04-28.png) 180 | 181 | There is a nice intuition behind what is going on whenever we are performing any of the methods described above. By the word greedy, we are stating that we are going to choose the action that locally maximizes the value function for the next state, not taking into account that we might be selecting this policy with regard to an _outdated_ value function. But given the niceness of the operators, and by niceness I mean that they are contraction mappings, we are left with the following picture 182 | 183 | ![](/assets/images/2023-01-24-11-04-11.png) 184 | 185 | The two steps in GPI pull in opposite directions: policy improvement pushes the policy toward the greedy optimal solution, and by doing so makes the current value function invalid in some sense. The evaluation step corrects this invalidity, and by doing so lets everybody know that the current policy is not optimal. For the techniques shown, given that there is a contraction mapping and guaranteed convergence to the unique optimal solution, it could be argued that the value function space is convex. 186 | 187 | 188 |
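To close the DP discussion, here is a minimal sketch of value iteration in the same spirit, again over plain NumPy arrays (shapes and names are my own; the repo exposes its own `value_iteration` solver through the `MDP` class).

```python
import numpy as np

def value_iteration(p, r, gamma=0.9, tol=1e-8):
    """Iterate the Bellman optimality operator B* and read off a greedy policy.

    p: (S, A, S') array with p(s'|s,a);  r: (S, A) array with r(s,a).
    """
    v = np.zeros(p.shape[0])
    while True:
        q = r + gamma * np.einsum('saz,z->sa', p, v)  # q(s,a) = r(s,a) + γ Σ_s' p(s'|s,a) v(s')
        v_new = q.max(axis=1)                         # B*[v](s) = max_a q(s,a)
        if np.max(np.abs(v_new - v)) < tol:
            return v_new, q.argmax(axis=1)            # optimal values and a greedy deterministic policy
        v = v_new
```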
189 |
190 | 191 | ### Copyright 192 | Copyright © 2023 Iván Belenky. This code is licensed under the MIT License. 193 | -------------------------------------------------------------------------------- /docs/MODELFREE.md: -------------------------------------------------------------------------------- 1 | # **Monte Carlo Methods** 2 | 3 | _"Monte Carlo methods utilize experience—sample sequences of states, actions, and rewards from actual or simulated interaction with an environment. Learning from actual experience is striking because it requires no prior knowledge of the environment’s dynamics, yet can still attain optimal behavior. Learning from simulated experience is also powerful. Although a model is required, the model need only generate sample transitions, not the complete probability distributions of all possible transitions that is required for dynamic programming (DP). In surprisingly many cases it is easy to generate experience sampled according to the desired probability distributions, but infeasible to obtain the distributions in explicit form."_ 4 | 5 | So basically, if we have a universe in which we can sample stuff, we don't even have to bother with the model. If we want to simulate it, we must create a transition model, but we still leave out the complication of building the actual transition probability function for each state-action pair. It is fair to say that given this approach, it is mandatory to take an episodic approach, otherwise there is no end to the endeavor. 6 | 7 | ## **Monte Carlo Prediction** 8 | 9 | The goal here is to learn the state-value function. The basic idea behind Monte Carlo is to average the returns observed across every episode we have at hand. Let's say that we have to find $\color{orange}v_{\pi}(s)$. Each occurrence of state $\color{orange}s$ in an episode is called a _visit_ to $\color{orange}s$. 10 | 11 | ### **First visit Monte Carlo** 12 | 13 | First-visit MC just averages the returns following the _first visit to each state_ $\color{orange}s$ 14 | 15 | Given 16 | - Input: $\color{orange}\pi$ 17 | - Initialize 18 | - state-value: $\color{orange}v(s)$ 19 | - returns for each state: $\color{orange}R(s)$ 20 | - While some condition is true (tolerance, amount of iterations) 21 | - Generate an episode following policy $\color{orange}{\pi \rightarrow S_{0}, A_{0}, R_{1}, \cdots , S_{T-1}, A_{T-1}, R_{T}}$ 22 | - $\color{orange}G \leftarrow 0$ 23 | - loop `episode[::-1]`, i.e. $\color{orange}T-1, T-2, \cdots, 0$ 24 | - $\color{orange}G \leftarrow \gamma G + R_{t+1}$ 25 | - if state $\color{orange}s$ not in $\color{orange}[S_0, S_1, \cdots, S_{t-1} ]$: 26 | - $\color{orange} R(s) \leftarrow \text{append} \ G$ 27 | - $\color{orange} v_{\pi}(s) \leftarrow avg(R(s))$ 28 | 29 | 30 | The exact implementation is arbitrary. If the backwards iteration gets confusing, write the natural forward implementation that follows the basic principles of the algorithm; the two are isomorphic. 31 | 32 | If a model is not available it is particularly useful to estimate action values, because otherwise it would be difficult to assess the best action to take in a given state, not knowing the space of possible next states. In DP, state values are all you need to know, since you have the model. In MC you don't have the model, so you need to estimate action values. 33 | 34 | Monte Carlo methods for estimating action values are exactly the same as the above, but instead of averaging the returns for each state, you average the returns for each state action pair.
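Before moving on, here is a small self-contained sketch of first-visit MC prediction. It works on already-collected episodes; the `(state, action, reward)` tuple layout is an assumption of mine, not the exact structure used by this repo's solvers.

```python
from collections import defaultdict

import numpy as np

def first_visit_mc_prediction(episodes, gamma=1.0):
    """Average the return following the first visit to each state.

    episodes: list of episodes, each a time-ordered list of (state, action, reward).
    """
    returns = defaultdict(list)
    for episode in episodes:
        states = [s for s, _, _ in episode]
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):   # iterate episode[::-1]
            s, _, r = episode[t]
            G = gamma * G + r                       # accumulate the discounted return
            if s not in states[:t]:                 # first-visit check
                returns[s].append(G)
    return {s: np.mean(gs) for s, gs in returns.items()}
```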
35 | 36 | Major drawback: you may not visit all state-action pairs. An edge case: $\color{orange}\pi$ is deterministic. This is the exploration vs. exploitation issue, i.e. maintaining exploration. Solutions: 37 | - `exploring starts`: make sure that episodes start at specific state-action pairs, and that every pair has a nonzero probability of being selected as the start 38 | - `stochastic selection of all possible actions`: sometimes select a random action; remember the epsilon policies from Multi Armed Bandits. 39 | 40 | ### **Monte Carlo Control** 41 | 42 | We can do basically the same as before: evaluate, improve, evaluate, improve, etc... but this time we are not able to make a policy greedy just by using state value functions, since the model is lacking. Therefore we have a very similar picture as in DP, but this time 43 | 44 | $$\color{orange}{ 45 | \pi_0 \overset{\mathbb{E}}{\longrightarrow} q_{\pi_0} \overset{\mathbb{I}}{\longrightarrow} \pi_1 \overset{\mathbb{E}}{\longrightarrow} \cdots \overset{\mathbb{I}}{\longrightarrow} \pi_{\*} \overset{\mathbb{E}}{\longrightarrow} q_{\pi_{\*}} 46 | } 47 | $$ 48 | 49 | so we are selecting the policy $\color{orange}\pi_{k+1}$ to be greedy with respect to $\color{orange}q_{\pi_{k}}$. 50 | 51 | 52 | We can replicate value iteration and policy iteration in some sense. That is, we can try to fully estimate the `q` function before improving, or we can improve on an episode-by-episode basis. 53 | 54 | 55 | ### **Monte Carlo with Exploring Starts** 56 | 57 | Monte Carlo with **exploring starts** is the natural way to implement this idea 58 | 59 | **ES** 60 | 61 | - Input: $\color{orange}\pi$ 62 | - $\color{orange} G\leftarrow 0$ 63 | - $\color{orange} Q(s,a) \leftarrow q_0(s,a) \in \mathbb{R}$ 64 | - $\color{orange} R(s,a) \leftarrow \emptyset$ 65 | - Loop forever 66 | - generate episode $\color{orange}S_{0}, A_{0}, R_{1}, \cdots , S_{T-1}, A_{T-1}, R_{T}$ making sure that the first state-action pair is selected randomly 67 | - loop `episode[::-1]` : `index: (t=T-1; t>=0; t--)` 68 | - $\color{orange}G\leftarrow \gamma G + R_{t+1}$ 69 | - if `(s,a)` not present in `episode[:t]`: 70 | - $\color{orange}R(s,a) \leftarrow \text{append} (G)$ 71 | - $\color{orange}Q(s,a) \leftarrow avg(R(s,a))$ 72 | - $\color{orange}\pi(s) \leftarrow \text{greedy} (Q(s,a))$ 73 | 74 | 75 | ### **Monte Carlo without Exploring Starts** 76 | 77 | There is a basic division in policy improvement algorithms, on-policy vs. off-policy, and MC w/o Exploring Starts seems a nice way to introduce it, as Sutton does. 78 | 79 | - On-policy uses the same policy that it is optimizing to generate the data 80 | - Off-policy uses one policy to optimize and another one to _search_ or **generate the data**. 81 | 82 | This is like learning from someone else's experience vs. our own. With the latter we must concentrate on knowing how good we are at the task, and try to navigate the behavior space in a way such that we maximize a goal, or minimize a cost. The experience to which we are going to be exposed while investigating is going to be somewhat biased by our current one, so we are somewhat sensitive to local minima, you could say. When learning from someone else (or others) we are in principle not prisoners of our biased trajectories. But nevertheless we need to be able to say, to a degree, that the current actions are somewhat compatible with our history. We can basically explore, but weight the exploration by how useful it is to us. 83 | 84 | ES is somewhat unrealizable since we cannot always force episodes to start at arbitrary state-action pairs, so some pairs may never be visited.
86 | - Input: $\color{orange}\pi$ 87 | - $\color{orange} G\leftarrow 0$ 88 | - $\color{orange} Q(s,a) \leftarrow q_0(s,a) \in \mathbb{R}$ 89 | - $\color{orange} R(s,a) \leftarrow \emptyset$ 90 | - Loop forever 91 | - generate episode $\color{orange}S_{0}, A_{0}, R_{1}, \cdots , S_{T-1}, A_{T-1}, R_{T}$ following $\color{orange}\pi$ (no exploring start is needed since $\color{orange}\pi$ is kept $\varepsilon$-soft) 92 | - loop `episode[::-1]` : `index:(t=T-1; t>=0; t--)` 93 | - $\color{orange}G\leftarrow \gamma G + R_{t+1}$ 94 | - if `(s,a)` not present in `episode[:t]`: 95 | - $\color{orange}R(s,a) \leftarrow \text{append} (G)$ 96 | - $\color{orange}Q(s,a) \leftarrow avg(R(s,a))$ 97 | - $\color{orange}\pi(s) \leftarrow \text{greedy} (Q(s,a))$ 98 | - $\color{orange}\forall a \in \mathbin{A}(S_t)$ 99 | - if $\color{orange}a=\argmax_a Q(s,a) \rightarrow \pi(a|s) = 1-\varepsilon + \frac{\varepsilon}{|\mathbin{A}(S_t)|}$ else $\color{orange}\pi(a|s) = \frac{\varepsilon}{|\mathbin{A}(S_t)|}$ 100 | 101 | 102 | The above algorithm optimizes over the $\color{orange}\varepsilon$-soft policies, described as policies that have a non-zero probability of selecting any action in all possible states. That is, we are optimizing over a modified transition operator. This is a world in which sometimes noise kicks you out of what seems to be the best policy, so in turn you learn to optimize assuming that you may sometimes be kicked out of a local optimum. 103 | 104 | ### **Off-Policy Importance Sampling** 105 | 106 | How do we maintain exploration and at the same time explore all possible actions, to find the potentially better ones? The above algorithm implies a compromise, since we are optimizing over a near-optimal policy that still explores. An alternative is to have two policies: 107 | - one that explores: `behavior policy` 108 | - one that gets optimized: `target policy` 109 | 110 | Off-policy vs. on-policy tradeoffs (off / on): 111 | - harder / simpler 112 | - more data / less data 113 | - more variance / less variance 114 | - more time for convergence / less time for convergence 115 | - general framework, a superset that includes `on policy` / the special case `behavior = target` 116 | - learn from whatever you choose / learn from what you do 117 | 118 | **Importance sampling** comes into play here. This is just a technique for estimating expected values under one distribution, given samples from another one. The following identity is quite intuitive 119 | 120 | $$ 121 | \color{orange}{ 122 | \mathbb{E}_{\sim p}(f) = \int f(x) p(x) dx = \mathbb{E}_{\sim q}(f \cdot p/q) = \int \frac{f(x)p(x)}{q(x)}q(x) dx 123 | } 124 | $$ 125 | 126 | So in essence what `IS` does is weight each point in probability space by a factor proportional to how likely it is to be sampled from $\color{orange}p$ relative to $\color{orange}q$. 127 | 128 | In the case of episodes or state-action trajectories, we get that the probability of obtaining a trajectory $\color{orange}A_t, S_{t+1}, \cdots, S_T$ under policy $\color{orange}\pi$ is 129 | 130 | $$ 131 | \color{orange}{ 132 | Pr\left\{ A_t, S_{t+1}, \cdots, S_T | S_t, A_{t:T-1} \sim \pi \right\} = \prod_{k=t}^{T-1}\pi(A_k|S_k) \cdot p(S_{k+1}|S_k, A_k) 133 | } 134 | $$ 135 | 136 | The same applies to any policy. If the behavior policy is $\color{orange}b$, the probability of obtaining a particular trajectory is 137 | 138 | $$\color{orange}{ 139 | \prod_{k=t}^{T-1}b(A_k|S_k) \cdot p(S_{k+1}|S_k, A_k) 140 | } 141 | $$ 142 | 143 | even if the way the world transitions is hidden, i.e.
how the world makes an update to the global state, we can write 144 | 145 | $$\color{orange}{ 146 | \frac{\prod_{k=t}^{T-1}\pi(A_k|S_k) \cdot p(S_{k+1}|S_k, A_k)}{\prod_{k=t}^{T-1}b(A_k|S_k) \cdot p(S_{k+1}|S_k, A_k)} = \frac{\prod_{k=t}^{T-1}\pi(A_k|S_k)}{\prod_{k=t}^{T-1}b(A_k|S_k)} = \rho_{t:T-1} 147 | } 148 | $$ 149 | 150 | and $\color{orange}\rho_{t:T-1}$ is the weighting factor for importance sampling. After this, all that is left to understand is that 151 | - the probability space from which the samples are drawn corresponds to the state-action trajectories 152 | - the expectation operator has to be applied to the returns $\color{orange}G_t$ 153 | - returns are a mapping from trajectories to real numbers 154 | - the expectation is going to be the mean of the returns 155 | 156 | Extra notation: 157 | - $\color{orange}\tau(s)$: set of time steps in which state $\color{orange}s$ was visited. This is for every-visit. For first-visit, it would be the set of all time steps that were first visits to $s$ within their respective episodes. 158 | - $\color{orange} T(t)$: index of the last time step in the episode belonging to the range $\color{orange}[t, T-1]$ 159 | - $\color{orange}G(t)$: return after $t$ up through $\color{orange}T(t)$. 160 | 161 | We then define `ordinary importance sampling` as 162 | 163 | $$\color{orange}{ 164 | V(s) = \frac{\sum_{t\in \tau(s)} \rho_{t:T(t)-1}G_t}{|\tau(s)|} 165 | } 166 | $$ 167 | 168 | and `weighted importance sampling` as 169 | 170 | $$\color{orange}{ 171 | V(s) = \frac{\sum_{t\in \tau(s)} \rho_{t:T(t)-1}G_t}{\sum_{t\in \tau(s)} \rho_{t:T(t)-1}} 172 | } 173 | $$ 174 | 175 | Basic differences between the two: 176 | - first visit 177 | - `ordinary` is unbiased but can have extremely high variance, since the ratios are not bounded. 178 | - `weighted` is biased (although the bias converges to zero) and its variance is bounded by the maximum return. 179 | - every visit 180 | - `ordinary` is biased, but the bias converges to zero 181 | - `weighted` is biased, but the bias converges to zero 182 | 183 | Down below, a nice example displays the convergence problems of ordinary importance sampling. 184 | 185 | ![](/assets/images/infinite_variance.png) 186 | 187 |
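The two estimators are easy to compare side by side. Below is a tiny, generic sketch (not the repo's `off_policy_mc`): given the returns observed under the behavior policy and their matching ratios, it computes both estimates.

```python
import numpy as np

def is_estimates(returns, rhos):
    """Ordinary vs. weighted importance-sampling estimates of V(s)."""
    returns, rhos = np.asarray(returns, dtype=float), np.asarray(rhos, dtype=float)
    ordinary = (rhos * returns).sum() / len(returns)
    weighted = (rhos * returns).sum() / rhos.sum() if rhos.sum() > 0 else 0.0
    return ordinary, weighted

# toy usage: one lucky episode with a huge ratio, two that the target policy would never produce
print(is_estimates(returns=[1.0, 0.0, 1.0], rhos=[10.0, 0.0, 0.0]))  # ordinary ~3.33, weighted 1.0
```

The toy numbers illustrate why ordinary importance sampling can blow up: a single large ratio dominates the unnormalized average, while the weighted estimate stays within the range of observed returns.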
188 | 189 | **First-Visit off-policy evaluation naive implementation** 190 | 191 | - Input: $\color{orange}\pi$ 192 | - $\color{orange} Q(s,a) \leftarrow q_0(s,a) \in \mathbb{R}$ 193 | - $\color{orange} R(s,a) \leftarrow \emptyset$ 194 | - $\color{orange} \tau(s,a) \leftarrow \emptyset$ 195 | - $\color{orange} \rho(s,a) \leftarrow \emptyset$ 196 | - Loop some number of episodes 197 | - $\color{orange} b\leftarrow$ any policy with coverage of $\color{orange}\pi$ 198 | - $\color{orange} G\leftarrow 0$ 199 | - $\color{orange} W \leftarrow 1$ 200 | - generate episode with $\color{orange}b \rightarrow S_{0}, A_{0}, R_{1}, \cdots , S_{T-1}, A_{T-1}, R_{T}$ 201 | - loop `episode[::-1]` - `index:t => (t=T-1; t>=0; t--)` 202 | - $\color{orange}G\leftarrow \gamma G + R_{t+1}$ 203 | - $\color{orange}W\leftarrow W \cdot \frac{\pi(a_t|s_t)}{b(a_t|s_t)}$ 204 | - if $\color{orange}W=0$ then break 205 | - if `(s,a)` not in `episode[:t]`: 206 | - $\color{orange}\tau(s,a) \leftarrow append(t)$ 207 | - $\color{orange}R(s,a) \leftarrow \text{append} (G)$ 208 | - $\color{orange}\rho(s,a) \leftarrow \text{append} (W)$ 209 | - $\color{orange} Q(s,a) \leftarrow \frac{\sum \rho R(s,a)}{\sum \rho}$ 210 | 211 |
213 | 214 | **Incremental implementation** 215 | 216 | The methods described in the multi armed bandits section can easily be applied when implementing incremental versions of the Monte Carlo algorithms displayed in these notes. The one difference in the update rule corresponds to `weighted` averages, since the update no longer depends only on the visit count. 217 | 218 | Given a set of returns $\color{orange}{G}_k$ and a respective set of weights $\color{orange}\rho_k$ the weighted average is 219 | 220 | $$\color{orange}{ 221 | V_{K+1}(s) = \frac{\sum_{k=1}^K \rho_k G_k}{\sum_{k=1}^K \rho_k} 222 | } 223 | $$ 224 | 225 | therefore 226 | 227 | $$\color{orange}{ 228 | V_{K+1}(s) = \frac{\sum_{k=1}^{K-1}{\rho_k G_k} + \rho_K G_K}{\sum_{k=1}^{K} \rho_k} = V_K + \frac{\rho_K}{\sum_{k=1}^K \rho_k} (G_K - V_K ) 229 | } 230 | $$ 231 | 232 | and this can be implemented without saving the lists of returns and weights: keeping the running value $\color{orange}V_K$ and the cumulative weight $\color{orange}C_K = \sum_{k=1}^K \rho_k$ is enough. 233 | 234 |
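In code the incremental rule is a one-liner per visit; here is a hedged sketch (the function name and signature are mine, not part of this repo).

```python
def incremental_weighted_update(v, c, g, rho):
    """One weighted-importance-sampling update: V_{K+1} = V_K + rho/C * (G - V_K).

    v: current estimate V_K, c: cumulative weight so far, g: new return, rho: its weight.
    Returns the updated (v, c); no lists of returns or weights need to be stored.
    """
    if rho == 0.0:
        return v, c
    c += rho
    v += (rho / c) * (g - v)
    return v, c
```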
235 | 236 | ### **Off Policy Control** 237 | 238 | This is the fun part: using off-policy methods to do policy improvement. So let's enumerate the concepts and requirements 239 | - the `behavior` policy $\color{orange}b$ is going to generate the episodes. 240 | - `coverage` must be guaranteed 241 | - it must be soft, i.e. $\color{orange}b(a|s) > 0 \ \forall s \in S, \ a \in A$ 242 | - the `target` policy is the one that is going to be greedy with respect to the q function. 243 | 244 | **Off-policy MC control incremental implementation for finding $\color{orange}\pi \approx \pi^{*}$** (a Python sketch of the inner loop follows the pseudocode) 245 | 246 | - Initialize: 247 | - $\color{orange} Q(s,a) \leftarrow q_0(s,a) \in \mathbb{R}$ 248 | - $\color{orange} C(s,a) \leftarrow 0$ 249 | - $\color{orange} \pi(s) = \argmax_{a} Q(s,a)$ 250 | - Loop some number of episodes 251 | - $\color{orange} b\leftarrow$ any soft policy with coverage of $\color{orange}\pi$ 252 | - $\color{orange} G\leftarrow 0$ 253 | - $\color{orange} W \leftarrow 1$ 254 | - generate episode with $\color{orange}b \rightarrow S_{0}, A_{0}, R_{1}, \cdots , S_{T-1}, A_{T-1}, R_{T}$ 255 | - loop `episode[::-1]` - `index:t => (t=T-1; t>=0; t--)` 256 | - $\color{orange}G\leftarrow \gamma G + R_{t+1}$ 257 | - $\color{orange}C(s,a) \leftarrow C(s,a) + W$ 258 | - $\color{orange}Q(s,a) \leftarrow Q(s,a) + \frac{W}{C(s,a)}(G-Q(s,a))$ 259 | - $\color{orange}\pi(s) \leftarrow \argmax_{a}Q(s,a)$ 260 | - if $\color{orange}\pi(s_t)\neq a_t \rightarrow $ break out of the inner loop (move on to the next episode) 261 | - $\color{orange}W \leftarrow W\cdot \frac{1}{b(a|s)}$ 262 | 263 |
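As announced above, here is a standalone, dictionary-based sketch of that inner loop. It is not the repo's `off_policy_mc` implementation, and all names are mine; it only illustrates the weighted-importance-sampling control update just described.

```python
def off_policy_mc_control_update(Q, C, episode, actions, b_prob, gamma=1.0):
    """Process one behavior-policy episode with weighted importance sampling.

    Q, C:    dicts mapping (state, action) -> estimate / cumulative weight
    episode: time-ordered list of (state, action, reward) generated by b
    b_prob:  function (state, action) -> b(a|s)
    """
    G, W = 0.0, 1.0
    for s, a, r in reversed(episode):
        G = gamma * G + r
        C[(s, a)] = C.get((s, a), 0.0) + W
        q_old = Q.get((s, a), 0.0)
        Q[(s, a)] = q_old + (W / C[(s, a)]) * (G - q_old)
        greedy_a = max(actions, key=lambda a_: Q.get((s, a_), 0.0))  # greedy target policy
        if a != greedy_a:
            break                        # pi(s_t) != a_t: ratios for earlier steps are zero
        W *= 1.0 / b_prob(s, a)          # target is greedy, so pi(a|s) = 1
    return Q, C
```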
265 | 266 | As a final remark, it is worth noting that there exist two other algorithms in the book, more specialized towards reducing variance and achieving faster convergence. They do not add to the core conceptual understanding of the problem and of how Monte Carlo solves it. 267 | - `discounting-aware importance sampling` 268 | - `per-decision importance sampling` 269 | 270 |
272 | 273 | 274 | 275 | 276 | ## Monte Carlo Tree Search MCTS 277 | 278 | MCTS is a Monte Carlo method used in planning and decision making. It balances the exploration/exploitation trade-off, and it can succeed even with little domain knowledge. 279 | 280 | The basic implementation is very simple. 281 | - A tree is built in an incremental and asymmetric manner. 282 | - For each iteration a _tree policy_ is used to find the best node to expand. This is the policy that tries to balance exploration and exploitation. 283 | - A simulation is then run from the leaf node that was selected, and the node is updated according to the result of this simulation (i.e. the reward is used to backpropagate statistics). A _default policy_ specifies how to simulate from a given state. It can be really simple, uniform for instance. 284 | 285 | 286 | A quote from a 2012 literature review paper: 287 | 288 | _However, it is really the success in computer Go, through the recursive application of Monte Carlo methods during the tree-building process, which has been responsible for much of the interest in MCTS. This is because Go is one of the few classic games for which human players are so far ahead of computer players. MCTS has had a dramatic effect on narrowing this gap, and is now competitive with the very best human players on small boards, though MCTS falls far short of their level on the standard 19×19 board._ 289 | 290 | Four years later, in 2016, AlphaGo, which combines MCTS with deep neural networks, was able to beat world champion Lee Sedol. 291 | 292 | The algorithm consists, as already stated, of building a search tree until some predefined constraint has been reached. 293 | 294 | - Selection: from the root node, a child is selected by recursively applying the tree policy. When a nonterminal state that has not yet been visited is encountered, we halt. 295 | - Expansion: one (or more) child nodes are created to expand the tree, according to the available actions. 296 | - Simulation: a simulation is run from the expanded node, using the default policy to produce an outcome. 297 | - Backpropagation: the statistics of the selected nodes are updated. 298 | 299 | - _Tree Policy_: select/create leaf nodes to expand the tree. Selection and expansion. 300 | - _Default Policy_: play out the domain from a given non-terminal state. 301 | 302 | MCTS here targets finite-horizon, finite-size MDPs, based on random episode sampling structured as a decision tree. The requisites for its use are listed below, followed by a short sketch of the UCT selection rule. 303 | 304 | - the state-action space must be finite 305 | - the MDP must be finite horizon 306 | - the MDP must be undiscounted, i.e. $\color{orange}\gamma = 1$. 307 | 308 | 309 |
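The repo's `mcts` solver takes an exploration constant `cp` (the README example uses `cp = 1/np.sqrt(2)`), which points at the standard UCB1/UCT selection rule for the tree policy. Below is a generic sketch of that rule only; the node attributes are assumptions of mine, not this repo's tree structure.

```python
import numpy as np

def uct_select(children, cp=1/np.sqrt(2)):
    """Pick the child maximizing the UCB1 score used by the UCT tree policy.

    children: nodes exposing .value (total simulated reward), .visits, and .parent.visits
    cp:       exploration constant balancing exploitation vs. exploration
    """
    def ucb1(node):
        if node.visits == 0:
            return np.inf                            # unvisited children are tried first
        exploit = node.value / node.visits           # average reward of simulations through this node
        explore = cp * np.sqrt(2 * np.log(node.parent.visits) / node.visits)
        return exploit + explore
    return max(children, key=ucb1)
```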
310 |
311 | 312 | ### Copyright 313 | Copyright © 2023 Iván Belenky. The code in this repository is licensed under the MIT License. 314 | All this notes correspond to Sutton's book, this is just a summary. 315 | -------------------------------------------------------------------------------- /examples/blackjack.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from rl.solvers.model_free import ( 4 | alpha_mc, 5 | off_policy_mc, 6 | tdn 7 | ) 8 | 9 | VALUES = ['A','2','3','4','5','6','7','8','9','10','J','Q','K'] 10 | SUITS = ['♠','♥','♦','♣'] 11 | CARDS = [(value,suit) for value in VALUES for suit in SUITS] 12 | 13 | states = [(i, False, dealer_showing) 14 | for i in range(4,21) 15 | for dealer_showing in VALUES] 16 | 17 | states += [(i, True, dealer_showing) 18 | for i in range(12,21) 19 | for dealer_showing in VALUES] 20 | 21 | actions = ['hit', 'stand'] 22 | 23 | def count(cards): 24 | counts = [0] 25 | for value in cards: 26 | if value in ['J','Q','K']: 27 | counts = [c+10 for c in counts] 28 | elif value == 'A': 29 | counts = [c+1 for c in counts] + [c+11 for c in counts] 30 | else: 31 | counts = [c+int(value) for c in counts] 32 | 33 | valid_counts = [c for c in counts if c <= 21] 34 | if len(valid_counts) == 0: 35 | return min(counts) 36 | return max(valid_counts) 37 | 38 | 39 | def black_jack_transition(state, action): 40 | player_sum, usable_ace, dealer_showing = state 41 | 42 | if action == 'hit' and player_sum < 21: 43 | new_card = random.choice(VALUES) 44 | if new_card == 'A': 45 | if player_sum + 11 > 21: 46 | card_value = 1 47 | else: 48 | card_value = 11 49 | usable_ace = True 50 | elif new_card in ['J','Q','K']: 51 | card_value = 10 52 | else: 53 | card_value = int(new_card) 54 | 55 | player_sum += card_value 56 | if usable_ace and player_sum > 21: 57 | player_sum -= 10 58 | usable_ace = False 59 | 60 | if player_sum > 21: 61 | return (state, -1.), True 62 | elif player_sum == 21: 63 | pass 64 | else: 65 | new_state = (player_sum, usable_ace, dealer_showing) 66 | return (new_state, 0.), False 67 | 68 | dealer_cards = [dealer_showing] 69 | dealer_sum = count(dealer_cards) 70 | if action == 'stand': 71 | dealer_plays = True 72 | while dealer_plays: 73 | dealer_sum = count(dealer_cards) 74 | if dealer_sum < 17: 75 | dealer_cards.append(random.choice(VALUES)) 76 | continue 77 | elif dealer_sum > 21: 78 | return (state, 1.), True 79 | elif 17 <= dealer_sum < 22: 80 | dealer_plays = False 81 | 82 | if dealer_sum > player_sum: 83 | return (state, -1.), True 84 | elif dealer_sum < player_sum: 85 | return (state, 1.), True 86 | elif dealer_sum == player_sum: 87 | return (state, 0.), True 88 | 89 | 90 | vqpi, samples = alpha_mc(states, actions, black_jack_transition, gamma=0.9, 91 | use_N=True, n_episodes=1E4, first_visit=False) 92 | -------------------------------------------------------------------------------- /examples/dyna_maze.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use('dark_background') 4 | 5 | from rl import dynaq, ModelFree 6 | 7 | GRID_HEIGHT, GRID_WIDTH = 6, 9 8 | START_XY, GOAL_XY = (0,3), (8,5) 9 | OBSTACLES = [(2,2), (2,3), (2,4), (5,1), (7,3), (7,4), (7,5)] 10 | 11 | states = [(x,y) for x in range(GRID_WIDTH) for y in range(GRID_HEIGHT) 12 | if (x,y) not in OBSTACLES] 13 | actions = ['left', 'right', 'up', 'down'] 14 | 15 | 16 | def obstacle_maze(state, action): 17 | x,y = state 18 | x_n, y_n = x, 
y 19 | 20 | if (x,y) == GOAL_XY: 21 | return (state, 1), True 22 | 23 | if action == 'left': 24 | x_n -= 1 25 | if action == 'right': 26 | x_n += 1 27 | if action == 'up': 28 | y_n += 1 29 | if action == 'down': 30 | y_n -= 1 31 | 32 | if x_n < 0 or x_n >= GRID_WIDTH: 33 | x_n = x 34 | if y_n < 0 or y_n >= GRID_HEIGHT: 35 | y_n = y 36 | if (x_n, y_n) in OBSTACLES: 37 | x_n, y_n = x, y 38 | 39 | state_n = (x_n, y_n) 40 | return (state_n, 0), False 41 | 42 | 43 | vqpi_0, samples_0 = dynaq(states, actions, obstacle_maze, START_XY, 44 | n_episodes=50, gamma=0.95, alpha=0.5, eps=0.1, n=0, max_steps=2E3) 45 | 46 | # plot found policy 47 | final_policy = samples_0[-1][-1] 48 | mf = ModelFree(states, actions, obstacle_maze, gamma=0.95, policy=final_policy) 49 | 50 | lrud = ['<', '>', '^', 'v'] 51 | pi = vqpi_0[2].pi 52 | 53 | plt.figure(figsize=(6,6)) 54 | for s, p in zip(states, pi): 55 | marker = lrud[np.argmax(p)] 56 | plt.scatter(s[0], s[1], c='red', marker=marker) 57 | 58 | for x,y in OBSTACLES: 59 | plt.scatter(x, y, c='white', marker='s') 60 | 61 | plt.xticks([]) 62 | plt.yticks([]) 63 | plt.show() 64 | 65 | 66 | # steps per episode function of N planning steps 67 | 68 | NS = [0, 5, 50] 69 | SMOOTH = 30 70 | 71 | model = ModelFree(states, actions, obstacle_maze,gamma=0.95) 72 | init_state = model.states.get_index(START_XY) 73 | 74 | all_steps_per_episode = [] 75 | for i in range(SMOOTH): 76 | vqpi_0, samples_0 = dynaq(states, actions, obstacle_maze, START_XY, 77 | n_episodes=50, gamma=0.95, alpha=0.1, eps=0.1, n=0, max_steps=1E4) 78 | vqpi_5, samples_5 = dynaq(states, actions, obstacle_maze, START_XY, 79 | n_episodes=50, gamma=0.95, alpha=0.1, eps=0.1, n=5, max_steps=1E4) 80 | vqpi_50, samples_50 = dynaq(states, actions, obstacle_maze, START_XY, 81 | n_episodes=50, gamma=0.95, alpha=0.1, eps=0.1, n=50, max_steps=1E4) 82 | 83 | steps_per_episode = [] 84 | for s0, s5, s50 in zip(samples_0, samples_5, samples_50): 85 | pi0, pi5, pi50 = s0[3], s5[3], s50[3] 86 | a0, a5, a50 = pi0(init_state), pi5(init_state), pi50(init_state) 87 | 88 | ep0 = model.generate_episode(START_XY, actions[a0] ,policy=pi0) 89 | ep5 = model.generate_episode(START_XY, actions[a5] ,policy=pi5) 90 | ep50 = model.generate_episode(START_XY, actions[a50] ,policy=pi50) 91 | 92 | steps_per_episode.append([len(ep0), len(ep5), len(ep50)]) 93 | all_steps_per_episode.append(steps_per_episode) 94 | 95 | 96 | steps_per_episode = np.mean(all_steps_per_episode, axis=0) 97 | 98 | mean_ep_steps = np.mean(all_steps_per_episode, axis=0) 99 | 100 | plt.figure(figsize=(6,6)) 101 | for i, n in enumerate(NS): 102 | plt.plot(mean_ep_steps[1:,i], linewidth=2, label='n={}'.format(n)) 103 | plt.legend(loc=1) 104 | plt.xlabel('Episode') 105 | plt.ylabel('Steps per episode') 106 | 107 | plt.show() -------------------------------------------------------------------------------- /examples/gridworld.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL - Copyright © 2023 Iván Belenky @Leculette 3 | """ 4 | import sys 5 | 6 | import numpy as np 7 | 8 | from rl.mdp import MDP, TabularReward 9 | 10 | GRID_SIZE = 5 # 5x5 gridworld 11 | 12 | def main(): 13 | optimization_method = sys.argv[1] if len(sys.argv) > 1 else None 14 | if optimization_method is None: 15 | print("No optimization method specified") 16 | return 17 | 18 | actions = np.arange(4) # up, right, down, left 19 | states = np.arange(GRID_SIZE**2) 20 | p_s = np.zeros((GRID_SIZE**2, 4, GRID_SIZE**2)) 21 | 22 | #initialized the transition matrix 23 | 
for i in range(GRID_SIZE): 24 | for j in range(GRID_SIZE): 25 | state_idx = i*GRID_SIZE+j 26 | p_s[state_idx][0][max(i-1,0)*GRID_SIZE+j] = 1 27 | p_s[state_idx][1][i*GRID_SIZE+min(j+1,GRID_SIZE-1)] = 1 28 | p_s[state_idx][2][min(i+1,GRID_SIZE-1)*GRID_SIZE+j] = 1 29 | p_s[state_idx][3][i*GRID_SIZE+max(j-1,0)] = 1 30 | 31 | #rewrite probs for potential positions of teleport 32 | p_s[0][1] = np.zeros(GRID_SIZE**2) 33 | p_s[0][1][21] = 1 34 | 35 | p_s[2][3] = np.zeros(GRID_SIZE**2) 36 | p_s[2][3][21] = 1 37 | 38 | p_s[6][0] = np.zeros(GRID_SIZE**2) 39 | p_s[6][0][21] = 1 40 | 41 | p_s[2][1] = np.zeros(GRID_SIZE**2) 42 | p_s[2][1][13] = 1 43 | 44 | p_s[4][3] = np.zeros(GRID_SIZE**2) 45 | p_s[4][3][13] = 1 46 | 47 | p_s[8][0] = np.zeros(GRID_SIZE**2) 48 | p_s[8][0][13] = 1 49 | 50 | 51 | #by not specifying the policy we get a equal prob one 52 | #it is fair to notice that this init process is tedious for tabular MDPs 53 | 54 | #initializing reward, if we land on target 55 | r_sa = np.zeros((GRID_SIZE**2, 4)) 56 | 57 | #Border. 58 | #If it try to go out of the grid, it gets -1 reward 59 | for i in range(GRID_SIZE): 60 | r_sa[i][0] = -1 61 | r_sa[i*GRID_SIZE+GRID_SIZE-1][1] = -1 62 | r_sa[i*GRID_SIZE][3] = -1 63 | r_sa[GRID_SIZE*(GRID_SIZE-1)+i][2] = -1 64 | 65 | #A 66 | #If lands on (0,1) position it gets a reward of +10 67 | r_sa[0][1] += 10 68 | r_sa[2][3] += 10 69 | r_sa[6][0] += 10 70 | 71 | #B 72 | #If lands on (0,3) position it gets a reward of +5 73 | r_sa[2][1] += 5 74 | r_sa[4][3] += 5 75 | r_sa[8][0] += 5 76 | 77 | print("Reward matrix going up") 78 | print(r_sa[:,0].reshape(GRID_SIZE,GRID_SIZE)) 79 | print("Reward matrix going right") 80 | print(r_sa[:,1].reshape(GRID_SIZE,GRID_SIZE)) 81 | print("Reward matrix going down") 82 | print(r_sa[:,2].reshape(GRID_SIZE,GRID_SIZE)) 83 | print("Reward matrix going left") 84 | print(r_sa[:,3].reshape(GRID_SIZE,GRID_SIZE)) 85 | 86 | #Define the Markov Decision Process 87 | mdp = MDP(p_s, states, actions, gamma = 0.9, 88 | reward_gen=TabularReward(r_sa)) 89 | 90 | #calculate beforehand 91 | v, q = mdp.vq_pi() 92 | print("Value Function before optimizing") 93 | print(v.reshape(GRID_SIZE,GRID_SIZE)) 94 | print('-'*50) 95 | print("Q Function before optimizing") 96 | print(q.reshape(GRID_SIZE,GRID_SIZE,4)) 97 | print('\n') 98 | 99 | mdp.optimize_policy(method=optimization_method) 100 | v, q = mdp.vq_pi() 101 | print("Value Function after optimizing") 102 | print(v.reshape(GRID_SIZE,GRID_SIZE)) 103 | print('-'*50) 104 | print("Q Function after optimizing") 105 | print(q.reshape(GRID_SIZE,GRID_SIZE,4)) 106 | print('\n') 107 | 108 | print("Optimal policy up action") 109 | print(mdp.policy.pi_sa[:,0].reshape(GRID_SIZE, GRID_SIZE)) 110 | print("Optimal policy right action") 111 | print(mdp.policy.pi_sa[:,1].reshape(GRID_SIZE, GRID_SIZE)) 112 | print("Optimal policy down action") 113 | print(mdp.policy.pi_sa[:,2].reshape(GRID_SIZE, GRID_SIZE)) 114 | print("Optimal policy left action") 115 | print(mdp.policy.pi_sa[:,3].reshape(GRID_SIZE, GRID_SIZE)) 116 | 117 | 118 | if __name__ == '__main__': 119 | main() -------------------------------------------------------------------------------- /examples/mcts.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use("dark_background") 4 | 5 | from rl import mcts 6 | 7 | GRID_HEIGHT, GRID_WIDTH = 6, 9 8 | START_XY, GOAL_XY = (0,3), (8,5) 9 | OBSTACLES = [(2,2), (2,3), (2,4), (5,1), (7,3), (7,4), (7,5)] 10 | 11 | 
states = [(x,y) for x in range(GRID_WIDTH) for y in range(GRID_HEIGHT) 12 | if (x,y) not in OBSTACLES] 13 | actions = ['left', 'right', 'up', 'down'] 14 | 15 | 16 | def obstacle_maze(state, action): 17 | x,y = state 18 | x_n, y_n = x, y 19 | 20 | reward = -0.05 21 | if action == 'left': 22 | x_n -= 1 23 | if action == 'right': 24 | x_n += 1 25 | if action == 'up': 26 | y_n += 1 27 | if action == 'down': 28 | y_n -= 1 29 | 30 | if x_n < 0 or x_n >= GRID_WIDTH: 31 | x_n = x 32 | if y_n < 0 or y_n >= GRID_HEIGHT: 33 | y_n = y 34 | if (x_n, y_n) in OBSTACLES: 35 | x_n, y_n = x, y 36 | 37 | state_n = (x_n, y_n) 38 | if state_n == GOAL_XY: 39 | return (state_n, 1), True 40 | return (state_n, reward), False 41 | 42 | def action_map(state): 43 | possible_actions = [] 44 | for a in actions: 45 | (s, _), _ = obstacle_maze(state, a) 46 | if s != state: 47 | possible_actions.append(a) 48 | return possible_actions 49 | 50 | 51 | 52 | if __name__ == "__main__": 53 | s = START_XY 54 | end = False 55 | tree = None 56 | while not end: 57 | action, _ = mcts(s, 0.0, 500, obstacle_maze, action_map, 25, eps=1) 58 | print(s, action) 59 | (s, _), end = obstacle_maze(s, action) 60 | 61 | tree.plot() -------------------------------------------------------------------------------- /examples/mountain_car.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use("dark_background") 4 | 5 | from rl import semigrad_tdn, gradient_mc, IHT, tiles 6 | from rl.approximators import LinearApproximator 7 | 8 | 9 | ACTIONS = (-1, 0, 1) 10 | X_BOUND = [-1.2, 0.5] 11 | V_BOUND = [-0.07, 0.07] 12 | 13 | 14 | def mountain_car(state, action): 15 | x, v = state 16 | new_v = v + 0.001*action - 0.0025*np.cos(3*x) 17 | new_x = x + new_v 18 | 19 | if new_x < X_BOUND[0]: 20 | new_x = X_BOUND[0] 21 | new_v = 0 22 | return ((new_x, new_v), -1), False 23 | elif new_x > X_BOUND[1]: 24 | return (state, 10), True 25 | else: 26 | new_v = np.clip(new_v, V_BOUND[0], V_BOUND[1]) 27 | return ((new_x, new_v), -1), False 28 | 29 | 30 | def state_generator(): 31 | x = np.random.uniform(X_BOUND[0], X_BOUND[1]) 32 | v = np.random.uniform(V_BOUND[0], V_BOUND[1]) 33 | return (x, v) 34 | 35 | 36 | iht_s = IHT(1000) 37 | iht_sa = IHT(4096) 38 | 39 | 40 | def state_action_aggregator(sa): 41 | s, a = sa 42 | x, v = s 43 | f = np.zeros(4096) 44 | tile = tiles(iht_sa, 8, [8*x/(0.5+1.2), 8*v/(0.07+0.07)], [a]) 45 | f[tile] = 1 46 | return f 47 | 48 | 49 | def state_aggregator(state): 50 | x, v = state 51 | f = np.zeros(1000) 52 | tile = tiles(iht_s, 8, [8*x/(0.5+1.2), v/(0.07+0.07)]) 53 | f[tile] = 1 54 | return f 55 | 56 | 57 | if __name__ == "__main__": 58 | vhat = LinearApproximator(fs=1000, basis=state_aggregator) 59 | qhat = LinearApproximator(fs=4096, basis=state_action_aggregator) 60 | 61 | vqpi_mc, samples_mc = gradient_mc(mountain_car, state_generator, ACTIONS, 62 | vhat, q_hat=qhat, state_0=(0,0), action_0=0, n_episodes=500, 63 | max_steps=1E4, alpha=0.1/8, eps=0.1, optimize=True) 64 | -------------------------------------------------------------------------------- /examples/random_walk.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rl import tdn, alpha_mc 4 | 5 | states = [1,2,3,4,5] 6 | actions = ['?'] #there are no actions :D 7 | 8 | def random_walk(state, action): 9 | go_right = np.random.random() > 0.5 10 | if go_right: 11 | if 1+state <= 5: 12 | return (1+state, 0), False 
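        # stepping right from state 5 exits through the right terminal; this is
        # the only transition in the chain that pays a non-zero reward (+1)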
13 | return (state, 1), True 14 | else: 15 | if state-1 == 0: 16 | return (state, 0), True 17 | return (state-1, 0), False 18 | 19 | 20 | _, samples_mc_01 = alpha_mc(states, actions, random_walk, alpha=0.01, 21 | first_visit=True, n_episodes=200) 22 | 23 | # ... -------------------------------------------------------------------------------- /examples/short_corridor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use('dark_background') 4 | 5 | from tqdm import tqdm 6 | 7 | from rl import reinforce_mc 8 | from rl.approximators import ModelFreeTL, LinearApproximator 9 | 10 | actions = ['left', 'right'] 11 | 12 | def short_corridor(state, action): 13 | go_right = (action == 'right') 14 | if state == 1: 15 | if go_right: 16 | return (2, -1), False 17 | return (1, 0), False 18 | if state == 2: 19 | if go_right: 20 | return (1, -1), False 21 | return (3, -1), False 22 | if state == 3: 23 | if go_right: 24 | return (state, 0), True 25 | return (2, -1), False 26 | 27 | 28 | def random_state(): 29 | return np.random.randint(1,4) 30 | 31 | 32 | def state_action_aggregator(sa): 33 | _, a = sa 34 | right = (a == 'right') 35 | if right: 36 | return np.array([1., 0.]) 37 | return np.array([0., 1.]) 38 | 39 | 40 | if __name__ == "__main__": 41 | pi_hat = LinearApproximator(fs=2, basis=state_action_aggregator) 42 | pi_hat.w = np.array([-1.47, 1.47]) 43 | 44 | pi, samples = reinforce_mc(short_corridor, random_state, pi_hat, actions, state_0=1, alpha=2E-4, 45 | gamma=1, n_episodes=1000, max_steps=1000, samples=100, tol=1/np.inf) 46 | model = ModelFreeTL(short_corridor, random_state, pi, gamma=1) 47 | 48 | SMOOTH = 100 49 | rewards = [] 50 | for i in tqdm(range(SMOOTH)): 51 | _rewards = [] 52 | for policy in samples: 53 | a0 = policy(1) 54 | episode = model.generate_episode(1, a0, policy=policy, max_steps=100) 55 | sar = np.array(episode) 56 | _rewards.append(sar[:,2].astype(int).sum()) 57 | rewards.append(_rewards) 58 | 59 | plt.plot(np.array(rewards).mean(axis=0)) 60 | plt.show() 61 | -------------------------------------------------------------------------------- /examples/single_state.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use("dark_background") 4 | 5 | from rl import tdn, alpha_mc, off_policy_mc, ModelFreePolicy 6 | 7 | # Define model free 8 | states = [0] 9 | actions = ['left', 'right'] 10 | 11 | def single_state_transition(state, action): 12 | if action == 'right': 13 | return (state, 0), True 14 | if action == 'left': 15 | threshold = np.random.random() 16 | if threshold > 0.9: 17 | return (state, 1), True 18 | else: 19 | return (state, 0), False 20 | 21 | b = ModelFreePolicy(actions, states) #by default 1 half 22 | pi = ModelFreePolicy(actions, states) 23 | pi.pi[0] = np.array([1, 0]) 24 | 25 | 26 | # calculate ordinary and weighted samples state value functions 27 | vqpi_ord, samples_ord = off_policy_mc(states, actions, single_state_transition, 28 | policy=pi, b=b, ordinary=True, first_visit=True, gamma=1., n_episodes=1E4) 29 | 30 | vqpi_w, samples_w = off_policy_mc(states, actions, single_state_transition, 31 | policy=pi, b=b, ordinary=False, first_visit=True, gamma=1., n_episodes=1E4) 32 | 33 | 34 | #Plot! 
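# Under the target policy (always 'left') every episode eventually terminates
# with a single reward of 1, so the true v(0) is 1. Weighted importance
# sampling settles there, while the ordinary importance-sampling estimate
# keeps spiking: its variance under the 50/50 behavior policy is unbounded.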
35 | vords = [v[1].values()[0] for v in samples_ord[1:]] 36 | vw = [v[1].values()[0] for v in samples_w[1:]] 37 | idxs = [v[0] for v in samples_ord[1:]] 38 | 39 | plt.figure(figsize=(10, 5)) 40 | plt.plot(idxs, vords, label='Ordinary Importance Sampling') 41 | plt.plot(idxs, vw, label='Weighted Importance Sampling') 42 | plt.xlabel('No episodes') 43 | plt.ylabel('v(0)') 44 | plt.xscale('log') 45 | plt.legend(loc=1) 46 | plt.show() -------------------------------------------------------------------------------- /examples/state_aggregation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | plt.style.use("dark_background") 4 | 5 | from rl import gradient_mc, semigrad_tdn 6 | from rl.approximators import LinearApproximator 7 | 8 | 9 | states = [(i//101,i) for i in range(1001)] 10 | actions = ['?'] #there are no actions :D 11 | 12 | 13 | def random_walk(state, action): 14 | group, pos = state 15 | go_right = np.random.random() > 0.5 16 | steps = np.random.randint(1,100) 17 | 18 | if go_right: 19 | if pos+steps <= 1000: 20 | new_pos = pos+steps 21 | new_group = new_pos//101 22 | return ((new_group, new_pos), 0), False 23 | return (state, 1), True 24 | else: 25 | if pos-steps > 0: 26 | new_pos = pos-steps 27 | new_group = new_pos//101 28 | return ((new_group, new_pos), 0), False 29 | return (state, -1), True 30 | 31 | 32 | def state_aggregator(state): 33 | group, _ = state 34 | x = np.zeros(10) 35 | x[group] = 1 36 | return x 37 | 38 | 39 | def state_generator(): 40 | pos = np.random.randint(1,1000) 41 | group = pos//101 42 | return (group, pos) 43 | 44 | 45 | if __name__ == "__main__": 46 | approximator_mc = LinearApproximator(k=10, fs=10, basis=state_aggregator) 47 | approximator_td = LinearApproximator(k=10, fs=10, basis=state_aggregator) 48 | 49 | vqpi_mc, samples_mc = gradient_mc(random_walk, state_generator, actions, 50 | approximator_mc, n_episodes=3E4, max_steps=1E5, 51 | alpha=2*10E-5) 52 | vqpi_td, samples_td = semigrad_tdn(random_walk, state_generator, actions, 53 | approximator_td, n_episodes=3E4, max_steps=1E5, 54 | alpha=2*10E-5) 55 | 56 | vhat_mc = vqpi_mc[0] 57 | vhat_td = vqpi_td[0] 58 | 59 | state_sample =[(pos//101, pos) for pos in np.arange(1001)] 60 | vpi_true = 2/1000*np.arange(1001) - 1 61 | vpi_mc = np.array([vhat_mc(s) for s in state_sample]) 62 | vpi_td = np.array([vhat_td(s) for s in state_sample]) 63 | 64 | plt.figure(figsize=(10,5)) 65 | plt.plot(vpi_true, label='True value') 66 | plt.plot(vpi_td, label='semigrad-tdn') 67 | plt.plot(vpi_mc, label='gradient-mc') 68 | plt.legend(loc=4) -------------------------------------------------------------------------------- /examples/windy_gridworld.py: -------------------------------------------------------------------------------- 1 | from rl import ( 2 | tdn, 3 | ModelFree, 4 | EpsilonSoftPolicy 5 | ) 6 | 7 | GRID_HEIGHT = 7 8 | GRID_WIDTH = 10 9 | WIND_WEIGHT_X = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0] 10 | GOAL_XY = (7, 3) 11 | 12 | states = [(j,i) for i in range(GRID_HEIGHT) for j in range(GRID_WIDTH)] 13 | actions = ['left', 'right', 'up', 'down'] 14 | 15 | def windy_grid_world(state, action): 16 | x, y = state 17 | if state == GOAL_XY: 18 | return (state, 1), True 19 | 20 | reward = 0 21 | if action == 'left': 22 | x = x-1 23 | y = y + WIND_WEIGHT_X[max(x, 0)] 24 | if action == 'right': 25 | x = x+1 26 | y = y + WIND_WEIGHT_X[min(x, GRID_WIDTH-1)] 27 | if action == 'up': 28 | y = y + 1 + WIND_WEIGHT_X[x] 29 | if action == 'down': 30 | 
y = y - 1 + WIND_WEIGHT_X[x] 31 | 32 | if x < 0: 33 | x = 0 34 | reward -= 1 35 | if x >= GRID_WIDTH: 36 | x = GRID_WIDTH-1 37 | reward -= 1 38 | if y < 0: 39 | y = 0 40 | reward -= 1 41 | if y >= GRID_HEIGHT: 42 | y = GRID_HEIGHT-1 43 | reward -= 1 44 | 45 | return ((x, y), reward), False 46 | 47 | vqpi, samples = tdn(states, actions, windy_grid_world, (0,3), 'right', 48 | gamma=1, n=1, alpha=0.5, eps=0.1, n_episodes=175, max_steps=3000, optimize=True) 49 | 50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asttokens==2.2.1 2 | backcall==0.2.0 3 | certifi==2022.9.24 4 | charset-normalizer==2.1.1 5 | click==8.1.3 6 | click-default-group==1.2.2 7 | cloup==0.13.1 8 | colour==0.1.5 9 | comm==0.1.2 10 | commonmark==0.9.1 11 | contourpy==1.0.6 12 | cycler==0.11.0 13 | debugpy==1.6.6 14 | decorator==5.1.1 15 | executing==1.2.0 16 | fonttools==4.38.0 17 | glcontext==2.3.7 18 | idna==3.4 19 | importlib-metadata==6.0.0 20 | ipykernel==6.21.1 21 | ipython==8.9.0 22 | isosurfaces==0.1.0 23 | jedi==0.18.2 24 | jupyter_client==8.0.2 25 | jupyter_core==5.2.0 26 | kiwisolver==1.4.4 27 | mapbox-earcut==0.12.11 28 | matplotlib==3.6.1 29 | matplotlib-inline==0.1.6 30 | moderngl==5.7.0 31 | moderngl-window==2.4.2 32 | multipledispatch==0.6.0 33 | nest-asyncio==1.5.6 34 | networkx==2.8.8 35 | numpy==1.23.4 36 | packaging==21.3 37 | parso==0.8.3 38 | pexpect==4.8.0 39 | pickleshare==0.7.5 40 | Pillow==9.3.0 41 | platformdirs==2.6.2 42 | prompt-toolkit==3.0.36 43 | psutil==5.9.4 44 | ptyprocess==0.7.0 45 | pure-eval==0.2.2 46 | pydub==0.25.1 47 | pyglet==2.0.0 48 | Pygments==2.13.0 49 | pyparsing==3.0.9 50 | pyrr==0.10.3 51 | python-dateutil==2.8.2 52 | pyzmq==25.0.0 53 | requests==2.28.1 54 | rich==12.6.0 55 | rl==0.0.0 56 | scipy==1.9.3 57 | screeninfo==0.8.1 58 | six==1.16.0 59 | skia-pathops==0.7.3 60 | srt==3.5.2 61 | stack-data==0.6.2 62 | tornado==6.2 63 | tqdm==4.64.1 64 | traitlets==5.9.0 65 | typing_extensions==4.4.0 66 | urllib3==1.26.12 67 | watchdog==2.1.9 68 | wcwidth==0.2.6 69 | zipp==3.12.0 70 | -------------------------------------------------------------------------------- /rl/__init__.py: -------------------------------------------------------------------------------- 1 | from .model_free import ( 2 | ModelFree, 3 | ModelFreePolicy, 4 | EpsilonSoftPolicy 5 | ) 6 | from .solvers.model_based import ( 7 | vq_pi_iter_naive, 8 | value_iteration, 9 | policy_iteration 10 | ) 11 | from .solvers.model_free import ( 12 | alpha_mc, 13 | tdn, 14 | off_policy_mc, 15 | n_tree_backup 16 | ) 17 | from .solvers.planning import ( 18 | dynaq, 19 | priosweep, 20 | t_sampling, 21 | mcts, 22 | rtdp, 23 | ) 24 | from .solvers.approx import ( 25 | gradient_mc, 26 | semigrad_tdn, 27 | lstd, 28 | semigrad_td_lambda, 29 | diff_semigradn, 30 | reinforce_mc 31 | ) 32 | from .tiles import IHT, tiles 33 | 34 | from .utils import TransitionException 35 | 36 | __all__ = [ 37 | 'ModelFree', 38 | 'ModelFreePolicy', 39 | 'EpsilonSoftPolicy', 40 | 'TransitionException', 41 | 'vq_pi_iter_naive', 42 | 'value_iteration', 43 | 'policy_iteration', 44 | 'alpha_mc', 45 | 'tdn', 46 | 'off_policy_mc', 47 | 'n_tree_backup', 48 | 'dynaq', 49 | 'priosweep', 50 | 't_sampling', 51 | 'mcts', 52 | 'rtdp', 53 | 'gradient_mc', 54 | 'semigrad_tdn', 55 | 'lstd', 56 | 'semigrad_td_lambda', 57 | 'diff_semigradn', 58 | 'reinforce_mc', 59 | 'Tile', 60 | 'tiles' 61 | ] 
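The solvers re-exported above are what the example scripts import directly from the package root. A minimal sketch of that usage, reusing states, actions and random_walk exactly as defined in examples/random_walk.py:

from rl import alpha_mc

# first-visit Monte Carlo prediction, mirroring examples/random_walk.py
_, samples_mc = alpha_mc(states, actions, random_walk,
                         alpha=0.01, first_visit=True, n_episodes=200)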
-------------------------------------------------------------------------------- /rl/approximators.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | from time import perf_counter 4 | from abc import ABC, abstractmethod 5 | from typing import ( 6 | Optional, 7 | Callable, 8 | Tuple, 9 | Callable, 10 | Sequence, 11 | Union, 12 | List, 13 | Any 14 | ) 15 | 16 | import numpy as np 17 | 18 | from rl.utils import ( 19 | Policy, 20 | Transition, 21 | TransitionException, 22 | EpisodeStep, 23 | W_INIT, 24 | MAX_ITER, 25 | MAX_STEPS 26 | ) 27 | 28 | ''' 29 | All of this may change if the policy gradient methods are 30 | similar to this implementation. 31 | 32 | SGD and Semi Gradient Linear methods: 33 | 34 | All Linear methods of this methods involve using a real value 35 | weight matrix/vector that will be used in conjunction with a 36 | basis function to approximate the value function. 37 | 38 | wt+1 = wt - 1/2 * alpha * d[(v_pi - v_pi_hat)^2]/dw 39 | wt+1 = wt + alpha * (v_pi - v_pi_hat)]*d[v_pi_hat]/dw 40 | wt+1 = wt + alpha * (U - v_pi_hat)]*d[v_pi_hat]/dw 41 | 42 | Since we dont have v_pi we have to use some estimator: 43 | - MC would imply grabbing full trajectories and using them 44 | - TD since involves bootstraping (it will be a semigradient method). 45 | 46 | Therefore we generate the most abstract SGD method. The two most 47 | important parts of this methods is the U approximation to the real 48 | value, and the value function approximator, this class should be 49 | differentiable, or hold a gradient method. 50 | ''' 51 | 52 | 53 | class Approximator(ABC): 54 | '''Approximator base class that implements caster methods 55 | as well as defining the basic interface of any approximator. 56 | It has to be updateable and callable. Updatability implies 57 | that it can change its inner attributes and hopefully learn. 58 | ''' 59 | @abstractmethod 60 | def __call__(self, s: Any, *args, **kwargs) -> float: 61 | '''Return the value of the approximation''' 62 | raise NotImplementedError 63 | 64 | @abstractmethod 65 | def update(self, *args, **kwargs) -> Union[None, np.ndarray]: 66 | '''Update the approximator''' 67 | raise NotImplementedError 68 | 69 | def copy(self, *args, **kwargs) -> Any: 70 | '''Return a copy of the approximator''' 71 | return copy.deepcopy(self) 72 | 73 | def is_differentiable(self): 74 | grad = getattr(self, "grad", None) 75 | if grad: 76 | return True 77 | return False 78 | 79 | class ModelFreeTLPolicy(Policy): 80 | '''ModelFreeTLPolicy is for approximated methods what 81 | ModelFreePolicy is for tabular methods. 82 | 83 | This policies are thought with tabular actions in mind, since 84 | the problem of continuous action spaces are a topic of ongoing 85 | research and not yet standardized. For each a in the action-space A 86 | there will exist an approximator. 
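    In this implementation that role is played by a single q_hat defined over
    (state, action) pairs: __call__ below queries q_hat once per action and
    returns the action with the largest estimate.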
87 | ''' 88 | def __init__(self, actions: Sequence[Any], q_hat: Approximator): 89 | self.actions = actions 90 | self.A = len(actions) 91 | self.q_hat = q_hat 92 | 93 | def update_policy(self, *args, **kwargs): 94 | self.q_hat.update(*args, **kwargs) 95 | 96 | def __call__(self, state: Any): 97 | action_idx = np.argmax([self.q_hat((state, a)) for a in self.actions]) 98 | return self.actions[action_idx] 99 | 100 | 101 | class EpsSoftSALPolicy(ModelFreeTLPolicy): 102 | def __init__(self, actions: Sequence[Any], q_hat: Approximator, 103 | eps: float = 0.1): 104 | super().__init__(actions, q_hat) 105 | self.eps = eps 106 | 107 | def __call__(self, state): 108 | if np.random.rand() < self.eps: 109 | return np.random.choice(self.actions) 110 | return super().__call__(state) 111 | 112 | 113 | class REINFORCEPolicy(ModelFreeTLPolicy): 114 | def __init__(self, actions: Sequence[Any], pi_hat: Approximator): 115 | '''Must be a differential approximator''' 116 | self.actions = actions 117 | self.pi_hat = pi_hat 118 | if not self.pi_hat.is_differentiable(): 119 | raise TypeError("Policy approximator pi_hat must be differentiable") 120 | 121 | def grad_lnpi(self, s, a): 122 | pi_sa = self.pi_sa(s).reshape(-1, 1) 123 | grad_pi_sa = self.pi_hat.grad((s, a)).reshape(-1, 1) 124 | grads_pi_sa = np.array([self.pi_hat.grad((s, a_i)) for a_i in self.actions]) 125 | return (grad_pi_sa - grads_pi_sa @ pi_sa).reshape(-1) 126 | 127 | def update_policy(self, c: float, s: Any, a: Any): 128 | self.pi_hat.w += c*self.grad_lnpi(s, a) 129 | 130 | def pi_sa(self, s: Any) -> np.ndarray: 131 | pi_hat_sa = [self.pi_hat((s, a)) for a in self.actions] 132 | max_sa = max(pi_hat_sa) 133 | e_hsa = [np.exp(pi_hat_sa[i] - max_sa) for i in range(len(self.actions))] 134 | denom = sum(e_hsa) 135 | pi_sa = np.array([e_hsa[i]/denom for i in range(len(self.actions))]) 136 | return pi_sa 137 | 138 | def __call__(self, s: Any) -> float: 139 | '''default softmax implementation''' 140 | return np.random.choice(self.actions, p=self.pi_sa(s)) 141 | 142 | 143 | class ModelFreeTL: 144 | ''' 145 | ModelFreeTL stands for Model Free Tabular Less, even if we have state, 146 | to approximate methods what ModelFree is to tabular ones. 147 | 148 | ModelFreeTL is used mostly internally for the seek of readability 149 | on solvers, but can be used standalone as well. The usual case 150 | for this is when you want to generate arbitrary episodes for a 151 | specific environment. This class will stand in between of the 152 | user implemented transitions and the solvers. In difference with 153 | tabular ModelFree there is no room for validation previous to 154 | runtime executions. 
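    A hypothetical usage sketch (argument names illustrative):

        model = ModelFreeTL(transition, rand_state, policy, gamma=1)
        episode = model.generate_episode(s_0, a_0)  # list of (state, action, reward)

    which mirrors how the approximate solvers in this package drive their
    environments.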
155 | ''' 156 | 157 | def __init__(self, transition: Transition, rand_state: Callable, 158 | policy: ModelFreeTLPolicy, gamma: float = 1): 159 | self.policy = policy 160 | self.rand_state = rand_state 161 | self.transition = transition 162 | self.gamma = gamma 163 | self._validate_transition() 164 | 165 | def _validate_transition(self): 166 | start = perf_counter() 167 | while perf_counter() - start < 2: 168 | rand_s = self.rand_state() 169 | rand_a = np.random.choice(self.policy.actions) 170 | try: 171 | self.transition(rand_s, rand_a) 172 | except Exception as e: 173 | raise TransitionException( 174 | f'Transition function is not valid: {e}') 175 | 176 | def random_sa(self): 177 | a = np.random.choice(self.policy.actions) 178 | s = self.rand_state() 179 | return s, a 180 | 181 | def generate_episode(self, 182 | s_0: Any, 183 | a_0: Any, 184 | policy: ModelFreeTLPolicy=None, 185 | max_steps: int=MAX_STEPS) -> List[EpisodeStep]: 186 | '''Generate an episode using given policy if any, otherwise 187 | use the one defined as the attribute''' 188 | policy = policy if policy else self.policy 189 | episode = [] 190 | end = False 191 | step = 0 192 | s_t_1, a_t_1 = s_0, a_0 193 | while (end != True) and (step < max_steps): 194 | (s_t, r_t), end = self.transition(s_t_1, a_t_1) 195 | episode.append((s_t_1, a_t_1, r_t)) 196 | a_t = policy(s_t) 197 | s_t_1, a_t_1 = s_t, a_t 198 | step += 1 199 | 200 | return episode 201 | 202 | def step_transition(self, state: Any, action: Any 203 | ) -> Tuple[Tuple[Any, float], bool]: 204 | return self.transition(state, action) 205 | 206 | 207 | class SGDWA(Approximator): 208 | '''Stochastic Gradient Descent Weight-Vector Approximator 209 | for MSVE (mean square value error). 210 | 211 | Differentiable Value Function approximator dependent 212 | on a weight vector. Must define a gradient method. Thought 213 | to be less of a general case and more oriented toward the 214 | mean square value error VE, the prediction objective. 215 | ''' 216 | def __init__(self, 217 | fs:int=None, 218 | basis: Optional[Callable[[Any], np.ndarray]]=None): 219 | ''' 220 | Parameters 221 | ---------- 222 | fs: int 223 | feature shape, i.e. dimensionality of the function basis 224 | basis: Callable[[Any], np.ndarray], optional 225 | function basis defaults to identity. If not specified the 226 | signature must be Callable[[np.ndarray], np.ndarray] otherwise 227 | it will be probably fail miserably. 
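        For example (mirroring examples/mountain_car.py, where the
        LinearApproximator alias defined at the bottom of this module is used):

            v_hat = LinearApproximator(fs=1000, basis=state_aggregator)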
228 | ''' 229 | self.fs = fs 230 | self.basis_name = basis.__name__ 231 | self.basis = basis if basis else lambda x: x 232 | self.w = np.ones(self.fs)*W_INIT 233 | 234 | def grad(self, x: Any) -> np.ndarray: 235 | '''Return the gradient of the approximation''' 236 | return self.basis(x) 237 | 238 | def delta_w(self, U: float, alpha: float, x: Any, g: np.ndarray) -> np.ndarray: 239 | '''g: vector value, either gradient or elegibility trace''' 240 | return alpha * (U - self(x)) * g 241 | 242 | def et_update(self, U: float, alpha: float, x: Any, z: np.ndarray) -> np.ndarray: 243 | '''Updates inplace with elegibility traces the weight vector''' 244 | dw = self.delta_w(U, alpha, x, z) 245 | self.w = self.w + dw 246 | return dw 247 | 248 | def update(self, U: float, alpha: float, x: Any) -> np.ndarray: 249 | '''Updates inplace the weight vector and returns update just in case''' 250 | dw = self.delta_w(U, alpha, x, self.grad(x)) 251 | self.w = self.w + dw 252 | return dw 253 | 254 | def __call__(self, x): 255 | return np.dot(self.w, self.basis(x)) 256 | 257 | 258 | LinearApproximator = SGDWA 259 | -------------------------------------------------------------------------------- /rl/armed_bandits.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL - Copyright © 2023 Iván Belenky @Leculette 3 | """ 4 | 5 | from typing import List 6 | 7 | import numpy as np 8 | import numpy.random as rnd 9 | 10 | from rl.utils import Policy, RewardGenerator 11 | 12 | 13 | GAUSSIAN = [RewardGenerator('normal', rnd.random(), rnd.random()) for _ in range(10)] 14 | NGAMES = 1 15 | NSTEPS = 1000 16 | 17 | 18 | class EpsilonGreedyBanditPolicy(Policy): 19 | def __init__(self, k: int=10, epsilon: float=0.1, offset: float=0.0): 20 | self.k = k 21 | self.eps = epsilon 22 | self.offset = offset 23 | self.q_values = np.zeros(k) + self.offset 24 | self.N = np.zeros(k) 25 | 26 | def __call__(self) -> int: 27 | if rnd.random() < self.eps: 28 | return rnd.randint(self.k) 29 | 30 | return np.argmax(self.q_values) 31 | 32 | def update_policy(self, action: int, reward: float) -> None: 33 | N = self.N[action] + 1 34 | self.N[action] = N 35 | 36 | Q = self.q_values[action] 37 | R = reward 38 | Qnew = Q + 1/N*(R-Q) 39 | 40 | self.q_values[action] = Qnew 41 | 42 | 43 | class UCBPolicy(Policy): 44 | def __init__(self, k: int=10, c: float=2.0, offset: float=0.0): 45 | self.k = k 46 | self.c = c 47 | self.offset = offset 48 | self.q_values = np.zeros(k) + self.offset 49 | self.N = np.zeros(k) 50 | self.init_counter = 0 51 | 52 | def __call__(self): 53 | if self.init_counter < self.k: 54 | action_index = self.init_counter 55 | self.init_counter += 1 56 | return action_index 57 | 58 | return np.argmax( 59 | self.q_values + self.c*np.sqrt(np.log(np.sum(self.N))/self.N)) 60 | 61 | def update_policy(self, action, reward): 62 | N = self.N[action] + 1 63 | self.N[action] = N 64 | 65 | Q = self.q_values[action] 66 | R = reward 67 | Qnew = Q + 1/N*(R-Q) 68 | 69 | self.q_values[action] = Qnew 70 | 71 | 72 | class AlphaEpsilonGreedyBanditPolicy(EpsilonGreedyBanditPolicy): 73 | def __init__(self, k: int=10, epsilon: int=0.1, alpha: int=0.1): 74 | super().__init__(k, epsilon) 75 | self.alpha = alpha 76 | 77 | def update_policy(self, action, reward): 78 | Q = self.q_values[action] 79 | R = reward 80 | Qnew = Q + self.alpha*(R-Q) 81 | 82 | self.q_values[action] = Qnew 83 | 84 | 85 | class GradientPolicy(Policy): 86 | def __init__(self, k: int=10, alpha: float=0.1): 87 | self.k = k 88 | self.alpha = alpha 89 | 
self.rewards = [] 90 | self.H = np.zeros(k) 91 | self.Pr = np.zeros(k) 92 | 93 | def __call__(self) -> int: 94 | self.Pr = np.exp(self.H)/np.sum(np.exp(self.H)) 95 | return np.random.choice(self.k, p=self.Pr) 96 | 97 | def update_policy(self, action, reward) -> None: 98 | self.H -= self.alpha*(reward - np.mean(self.rewards))*self.Pr 99 | self.H[action] += self.alpha*(reward - np.mean(self.rewards)) 100 | self.rewards.append(reward) 101 | 102 | 103 | EGREEDY = EpsilonGreedyBanditPolicy() 104 | 105 | 106 | class MultiArmedBandit: 107 | def __init__( 108 | self, 109 | k: int=10, 110 | reward_generators: List[RewardGenerator]=GAUSSIAN, 111 | n_games: int=NGAMES, 112 | policy: Policy=EGREEDY): 113 | 114 | self.k = k 115 | self.reward_generators = reward_generators 116 | self.N = n_games 117 | self.histories = [] 118 | self.reward_history = [] 119 | self.action_history = [] 120 | self.policy = policy 121 | self.ground_truth = np.argmax([ 122 | rg.mean() for rg in self.reward_generators]) 123 | 124 | def step(self, action: int) -> float: 125 | reward = self.reward_generators[action].generate() 126 | self.reward_history.append(reward) 127 | self.action_history.append(action) 128 | 129 | return reward 130 | 131 | def reset(self) -> None: 132 | self.action_history = [] 133 | self.reward_history = [] 134 | 135 | def evaluate_policy(self) -> List[float]: 136 | for _ in range(self.N): 137 | self.step(self.policy()) 138 | 139 | return self.reward_history 140 | 141 | def update_policy(self) -> None: 142 | for _ in range(self.N): 143 | action = self.policy() 144 | reward = self.step(action) 145 | self.policy.update_policy(action, reward) 146 | 147 | def best_action_percentage(self) -> None: 148 | ah = np.array(self.action_history) 149 | n = ah[ah==self.ground_truth] 150 | return n.shape[0]/ah.shape[0] -------------------------------------------------------------------------------- /rl/mdp.py: -------------------------------------------------------------------------------- 1 | '''RL Copyright © 2023 Iván Belenky''' 2 | 3 | from typing import Tuple, List 4 | from abc import ABC, abstractmethod 5 | 6 | import numpy as np 7 | 8 | from rl.utils import Policy, RewardGenerator 9 | from rl.solvers.model_based import ( 10 | vq_pi_iter_naive, 11 | policy_iteration, 12 | value_iteration 13 | ) 14 | 15 | PROB_TOL = 1E-3 16 | ESTIMATE_ITERS = int(1E3) 17 | 18 | 19 | class MarkovReward(ABC): 20 | @abstractmethod 21 | def generate(self, state: int, action: int) -> float: 22 | raise NotImplementedError 23 | 24 | @abstractmethod 25 | def r_sas(self, next_state: int) -> float: 26 | ''' 27 | r(s,a,s') = E[Rt|St-1 = s, At-1 = a, St = s'] 28 | ''' 29 | 30 | raise NotImplementedError 31 | 32 | def r_sa(self, p_s: np.ndarray, state: int, action: int): 33 | ''' 34 | r(s,a) = E[Rt|St-1 = s, At-1 = a] 35 | ''' 36 | p = p_s[state][action] 37 | r = 0 38 | for i,ps in enumerate(p): 39 | r += ps*self.mean(state=self.states[i]) 40 | return r 41 | 42 | 43 | class TabularReward(MarkovReward): 44 | ''' 45 | Tabular reward implements as the name suggests a reward 46 | per state and action. The reward is a matrix of size SxA. 47 | This type of reward is used in the case that the world 48 | in which the agent conducts gives you fixed rewards for 49 | taking action: a at state: s. 
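    The gridworld example builds exactly such an (S, A) array and passes it as
    reward_gen=TabularReward(r_sa) when constructing the MDP.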
50 | ''' 51 | 52 | def __init__( 53 | self, 54 | r_sa: np.ndarray, 55 | ): 56 | self.states, self.actions = r_sa.shape 57 | self._r_sa = r_sa 58 | 59 | def generate(self, state: int = 0, action: int = 0) -> float: 60 | return self._r_sa[state][action] 61 | 62 | def r_sa(self, p_s: np.ndarray, state: int, action: int): 63 | return self._r_sa[state][action] 64 | 65 | def r_sas(self, next_state: int) -> float: 66 | return np.mean(self._r_sa[next_state]) 67 | 68 | 69 | 70 | class MarkovPolicy(Policy): 71 | ''' 72 | Markov Policy is a policy that is defined by a matrix of size SxA. 73 | This class admits a policy defined by the user or a equally probable 74 | policy will be created. 75 | 76 | The policy matrix π(a|s) must be a matrix of size SxA where each row 77 | represents the probability of taking action a at state s. Therefore 78 | each row must sum to 1 within the specified tolerance 1E-3. 79 | ''' 80 | 81 | def __init__(self, pi_sa: np.ndarray = None, s: int = None, a:int = None): 82 | ''' 83 | pi_sa: policy matrix 84 | s: number of states 85 | a: number of actions 86 | 87 | pi_sa and s and a are mutually exclusive. If pi_sa is provided then 88 | s and a are ignored. If pi_sa is not provided then s and a must be 89 | provided. 90 | ''' 91 | if not pi_sa and not (s or a): 92 | raise ValueError("Either pi_sa or s and a must be provided") 93 | 94 | if pi_sa: 95 | self.pi_sa = pi_sa 96 | self.s, self.a = self.pi_sa.shape 97 | self._validate_attr() 98 | else: 99 | self.s = s 100 | self.a = a 101 | #equal probable policy 102 | self.pi_sa = np.ones((self.s, self.a))/self.a 103 | 104 | def _validate_attr(self): 105 | if not np.allclose(self.pi_sa.sum(axis=1), 1, atol=PROB_TOL): 106 | raise ValueError("Each row must sum to 1") 107 | 108 | def update_policy(self, q_pi: np.ndarray): 109 | ''' 110 | Updates the policy based on the Q function: for each state s 111 | the action a that maximizes Q(s,a) is selected. If there are 112 | multiple actions that maximize Q(s,a) then the policy is 113 | updated to be equally probable among those actions. 114 | ''' 115 | self.pi_sa = np.array([self._update_policy(q_pi, s) 116 | for s in range(self.s)]) 117 | 118 | def _update_policy(self, q_pi: np.ndarray, state: int) -> np.ndarray: 119 | q_sa = q_pi.T[state] 120 | max_q = max(q_sa) 121 | max_q_sa = np.array([q_sa[a] == max_q for a in range(self.a)]) 122 | return max_q_sa / sum(max_q_sa) 123 | 124 | def π(self, state: int): 125 | ''' 126 | π(a|s=state) 127 | ''' 128 | return self.pi_sa[state] 129 | 130 | def __call__(self, state: int) -> np.ndarray: 131 | ''' 132 | Collapses the policy to a single action, i.e. a sample from the 133 | random variable that represents the policy. 
134 | ''' 135 | return np.random.choice(self.pi_sa[state], p=self.pi_sa[state]) 136 | 137 | 138 | class MDP: 139 | VQ_PI_SOLVERS = { 140 | 'iter_n': vq_pi_iter_naive 141 | } 142 | 143 | OPTIMAL_POLICY_SOLVERS = { 144 | 'policy_iteration' : policy_iteration, 145 | 'value_iteration' : value_iteration, 146 | } 147 | 148 | def __init__( 149 | self, 150 | p_s: np.ndarray, 151 | states: np.ndarray, 152 | actions: np.ndarray, 153 | gamma: float = 0.9, 154 | policy: Policy = None, 155 | reward_gen: RewardGenerator = None, 156 | ): 157 | self.p_s = p_s 158 | self.states = states 159 | self.actions = actions 160 | self.gamma = gamma 161 | self.reward_gen = reward_gen 162 | self.history = [] 163 | self._validate_attr() 164 | 165 | self.S = self.states.shape[0] 166 | self.A = self.actions.shape[0] 167 | self.policy = policy if policy else MarkovPolicy(s=self.S, a=self.A) 168 | 169 | @property 170 | def cum_return(self) -> float: 171 | return np.sum([r for _, r in self.history]) 172 | 173 | @property 174 | def discounted_return(self) -> float: 175 | return np.sum( 176 | [r*(self.gamma**i) for i,(_, r) in enumerate(self.history)]) 177 | 178 | def _validate_attr(self): 179 | S = self.states.shape[0] 180 | A = self.actions.shape[0] 181 | if self.p_s.shape != (S, A, S): 182 | raise ValueError( 183 | "p_s must be of shape " + 184 | f"(n_states, n_actions, n_states) = ({S}, {A}, {S})") 185 | 186 | for i in range(S): 187 | if not np.allclose(self.p_s[i].sum(axis=1), 1, atol=PROB_TOL): 188 | raise ValueError("Each row must sum to 1") 189 | 190 | if self.gamma > 1 or self.gamma < 0: 191 | raise ValueError( 192 | f"discounted rate gamma has to be in range [0, 1]") 193 | 194 | def r_sa(self, state: int, action: int) -> float: 195 | return self.reward_gen.r_sa(self.p_s, state, action) 196 | 197 | def r_sas(self, next_s: int) -> float: 198 | return self.reward_gen.r_sas(next_s) 199 | 200 | def pi_sa(self, state: int) -> np.ndarray: 201 | return self.policy.pi_sa(state) 202 | 203 | def vq_pi( 204 | self, 205 | policy: MarkovPolicy = None, 206 | method: str = 'iter_n' 207 | ) -> np.ndarray: 208 | ''' 209 | Individual state value functions and action-value functions 210 | vpi and qpi cannot be calculated for bigger problems. That 211 | constraint will give rise to parametrizations via DL. 212 | ''' 213 | policy = policy if policy else self.policy 214 | solver = self.VQ_PI_SOLVERS.get(method) 215 | if not solver: 216 | raise ValueError(f"Method {method} does not exist") 217 | 218 | return solver(self, policy) 219 | 220 | def optimize_policy( 221 | self, 222 | method: str = 'policy_iteration', 223 | policy: MarkovPolicy = None 224 | ) -> MarkovPolicy: 225 | ''' 226 | Optimal policy is the policy that maximizes the expected 227 | discounted return. It is the policy that maximizes the 228 | value function for each possible state. 
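        Available methods are the keys of OPTIMAL_POLICY_SOLVERS:
        'policy_iteration' (default) and 'value_iteration'.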
229 | ''' 230 | policy = policy if policy else self.policy 231 | solver = self.OPTIMAL_POLICY_SOLVERS.get(method) 232 | if not solver: 233 | raise ValueError(f"Method {method} does not exist") 234 | 235 | solver(self, policy) 236 | 237 | 238 | def __call__(self, state: int = 0) -> Tuple[int, float]: 239 | p = self.p_s[state][self.policy(state)] 240 | next_state = np.random.choice(self.states, p=p) 241 | self.curr_state = next_state 242 | reward = self.reward_gen.generate(next_state) 243 | 244 | self.history.append((self.curr_state, reward)) 245 | 246 | return next_state, reward -------------------------------------------------------------------------------- /rl/model_free.py: -------------------------------------------------------------------------------- 1 | ''' 2 | RL - Copyright © 2023 Iván Belenky @Leculette 3 | ''' 4 | 5 | from typing import ( 6 | Tuple, 7 | Union, 8 | Sequence, 9 | Callable, 10 | List, 11 | Any, 12 | NewType, 13 | ) 14 | 15 | import numpy as np 16 | 17 | from rl.utils import ( 18 | Policy, 19 | State, 20 | Action, 21 | StateAction, 22 | TransitionException, 23 | EpisodeStep, 24 | MAX_ITER, 25 | MAX_STEPS 26 | ) 27 | 28 | class ModelFreePolicy(Policy): 29 | def __init__(self, A: Union[Sequence[Any], int], S: Union[Sequence[Any], int]): 30 | if not isinstance(A, int): 31 | A = len(A) 32 | if not isinstance(S, int): 33 | S = len(S) 34 | self.A = A 35 | self.S = S 36 | self.pi = np.ones((S, A))/A 37 | 38 | def __call__(self, state: int): 39 | return np.random.choice(self.A, p=self.pi[state]) 40 | 41 | def pi_as(self, action: int, state: int): 42 | return self.pi[state, action] 43 | 44 | def update_policy(self, q, s): 45 | qs_mask = (q[s] == np.max(q[s])) 46 | self.pi[s] = np.where(qs_mask, 1.0/qs_mask.sum(), 0) 47 | 48 | def _make_deterministic(self): 49 | self.pi = np.eye(self.A)[np.argmax(self.pi, axis=1)] 50 | 51 | 52 | class EpsilonSoftPolicy(ModelFreePolicy): 53 | def __init__(self, A, S, eps): 54 | super().__init__(A, S) 55 | self.Ɛ = eps 56 | 57 | def update_policy(self, q, s): 58 | # if there are multiple actions with the same value, 59 | # then we choose one of them randomly 60 | max_q = np.max(q[s]) 61 | qs_mask = (q[s] == max_q) 62 | self.pi[s] = self.Ɛ/self.A 63 | self.pi[s, qs_mask] += (1 - self.Ɛ)/qs_mask.sum() 64 | 65 | 66 | class ModelFree: 67 | ''' 68 | ModelFree is the base holder of the states, actions, and 69 | the transition defining an environment. 70 | 71 | ModelFree is used mostly internally for the seek of readability 72 | on solvers, but can be used standalone as well. The usual case 73 | for this is when you want to generate arbitrary episodes of a 74 | specific environment. This class will stand in between of the 75 | user implemented transitions and validate its correct behavior. 
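    The user supplied transition must follow the signature used throughout the
    examples,

        (next_state, reward), done = transition(state, action)

    and every state/action it returns must be declared in states/actions;
    _validate_transition below checks exactly that at construction time.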
76 | ''' 77 | 78 | def __init__(self, states: Sequence[Any], actions: Sequence[Any], 79 | transition: Callable, gamma: float = 1, policy: ModelFreePolicy = None 80 | ): 81 | 82 | self.policy = policy 83 | self.states = State(states) 84 | self.actions = Action(actions) 85 | self.stateaction = StateAction( 86 | [(s,a) for s,a in zip(states, actions)]) 87 | self.transition = transition 88 | self.gamma = gamma 89 | self.policy = policy if policy else ModelFreePolicy( 90 | self.actions.N, self.states.N) 91 | 92 | self._validate_transition() 93 | 94 | def init_vq(self): 95 | v = np.zeros(self.states.N) 96 | q = np.zeros((self.states.N, self.actions.N)) 97 | return v,q 98 | 99 | def random_sa(self, value=False): 100 | s = self.states.random(value) 101 | a = self.actions.random(value) 102 | return s, a 103 | 104 | def _to_index(self, state, action): 105 | state = self.states.get_index(state) 106 | action = self.actions.get_index(action) 107 | 108 | return state, action 109 | 110 | def _validate_transition(self): 111 | states = self.states.seq 112 | actions = self.actions.seq 113 | sa = [(s,a) for s in states for a in actions] 114 | 115 | success, fail_count = True, 0 116 | for s, a in sa: 117 | try: 118 | self.__validate_transition(s, a) 119 | except Exception as e: 120 | success = False 121 | fail_count += 1 122 | print(f"Warning: {e}") # TODO: change to logger 123 | 124 | if not success: 125 | raise TransitionException( 126 | f"Transition failed for {fail_count} state-action pairs") 127 | 128 | def __validate_transition(self, state: Any, action: Any, 129 | ) -> Tuple[Tuple[Any, Union[float, int]], bool]: 130 | 131 | try: 132 | (s, r), end = self.transition(state, action) 133 | except Exception as e: 134 | raise TransitionException(f"Transition method failed: {e}") 135 | 136 | if not isinstance(end, bool) or not isinstance(r, (float, int)): 137 | raise TransitionException( 138 | "Transition method must return (Any, float), bool" 139 | f" instead of ({type(s)}, {type(r)}), {type(end)}" 140 | ) 141 | try: 142 | self.states.get_index(s) 143 | self.states.get_index(state) 144 | self.actions.get_index(action) 145 | except Exception as e: 146 | raise TransitionException( 147 | f"Undeclared state or action in transition method: {e}") 148 | 149 | return (s, r), end 150 | 151 | def generate_episode(self, s_0: Any, a_0: Any, policy: ModelFreePolicy = None, 152 | max_steps: int=MAX_STEPS) -> List[EpisodeStep]: 153 | 154 | policy = policy if policy else self.policy 155 | 156 | episode = [] 157 | end = False 158 | step = 0 159 | s_t_1, a_t_1 = s_0, a_0 160 | while (end != True) and (step < max_steps): 161 | (s_t, r_t), end = self.transition(s_t_1, a_t_1) 162 | (_s, _a), _r = self._to_index(s_t_1, a_t_1), r_t 163 | episode.append((_s, _a, _r)) 164 | a_t = policy(self.states.get_index(s_t)) 165 | s_t_1, a_t_1 = s_t, self.actions.from_index(a_t) 166 | 167 | step += 1 168 | 169 | return episode 170 | 171 | def step_transition(self, state: int, action: int 172 | ) -> Tuple[Tuple[int, float], bool]: 173 | 174 | s, a = self.states.from_index(state), self.actions.from_index(action) 175 | (s_t, r_t), end = self.transition(s, a) 176 | s_new = self.states.get_index(s_t) 177 | return (s_new, r_t), end -------------------------------------------------------------------------------- /rl/solvers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanbelenky/RL/00136148ae5679245fc50623fb5e8fcb072e60dc/rl/solvers/__init__.py 
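For reference, the tabular episodes produced by ModelFree.generate_episode are consumed like this in the planning example earlier (a sketch reusing names from examples/dyna_maze.py):

model = ModelFree(states, actions, obstacle_maze, gamma=0.95)
init_state = model.states.get_index(START_XY)
a0 = final_policy(init_state)  # index of the action the learned policy picks at the start
episode = model.generate_episode(START_XY, actions[a0], policy=final_policy)
# episode is a list of (state_index, action_index, reward) triples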
-------------------------------------------------------------------------------- /rl/solvers/approx.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractclassmethod 2 | from copy import deepcopy 3 | from typing import ( 4 | Sequence, 5 | Callable, 6 | Tuple, 7 | Optional, 8 | List, 9 | Any, 10 | NewType 11 | ) 12 | 13 | import numpy as np 14 | from numpy.linalg import norm as lnorm 15 | from tqdm import tqdm 16 | 17 | from rl.approximators import ( 18 | Approximator, 19 | SGDWA, 20 | ModelFreeTL, 21 | ModelFreeTLPolicy, 22 | EpsSoftSALPolicy, 23 | REINFORCEPolicy 24 | ) 25 | from rl.utils import ( 26 | _typecheck_all, 27 | _get_sample_step, 28 | _check_ranges, 29 | Samples, 30 | Transition, 31 | MAX_ITER, 32 | MAX_STEPS, 33 | TOL 34 | ) 35 | 36 | class AVQPi: 37 | def __init__(self, v: Approximator, q: Approximator, pi: ModelFreeTLPolicy): 38 | self.v_hat = v 39 | self.q = q 40 | self.pi = pi 41 | 42 | 43 | def get_sample(v_hat, q_hat, π, n_episode, optimize): 44 | _idx = n_episode 45 | _v = v_hat.copy() 46 | _q = None 47 | _pi = None 48 | if optimize: 49 | _pi = deepcopy(π) 50 | _q = q_hat.copy() 51 | return (_idx, _v, _q, _pi) 52 | 53 | 54 | def _set_s0_a0(MFS, s, a): 55 | s_0, a_0 = MFS.random_sa() 56 | s_0 = s_0 if not s else s 57 | a_0 = a_0 if not a else a 58 | return s_0, a_0 59 | 60 | 61 | def onehot_q_hat(v_hat, actions): 62 | '''V(s) function approximator to Q(s,a) function approximator''' 63 | A = len(actions) 64 | onehot_actions = {a:np.zeros(A-1) for a in actions} 65 | for a in range(A-1): 66 | onehot_actions[a][a] = 1 67 | 68 | def new_basis(sa): 69 | s, a = sa 70 | b_s = v_hat.basis(s) 71 | a = onehot_actions[a] 72 | b_sa = np.append(b_s, a) 73 | return b_sa 74 | 75 | fs = v_hat.fs + A - 1 76 | basis = new_basis 77 | 78 | q_hat = v_hat.__class__(fs, basis) 79 | return q_hat 80 | 81 | 82 | def _set_policy(policy, eps, actions, v_hat, q_hat): 83 | if not policy: 84 | if not q_hat: 85 | q_hat = onehot_q_hat(v_hat, actions) 86 | if eps: 87 | _typecheck_all(constants=[eps]) 88 | _check_ranges(values=[eps], ranges=[(0,1)]) 89 | policy = EpsSoftSALPolicy(actions, q_hat, eps=eps) 90 | else: 91 | policy = ModelFreeTLPolicy(actions, q_hat) 92 | return policy 93 | 94 | 95 | def gradient_mc(transition: Transition, 96 | random_state: Callable[[Any], Any], 97 | actions: Sequence[Any], 98 | v_hat: SGDWA, 99 | q_hat: SGDWA=None, 100 | state_0: Any=None, 101 | action_0: Any=None, 102 | alpha: float=0.05, 103 | gamma: float=1.0, 104 | n_episodes: int=MAX_ITER, 105 | max_steps: int=MAX_STEPS, 106 | samples: int=1000, 107 | optimize: bool=False, 108 | policy: ModelFreeTLPolicy=None, 109 | tol: float=TOL, 110 | eps: float=None) -> Tuple[AVQPi, Samples]: 111 | '''Gradient α-MC algorithm for estimating, and optimizing policies 112 | 113 | gradient_mc uses the gradient of VE to estimate the value of 114 | a state given a policy. The work behind estimation runs is to 115 | the training process of the value function approximator with MC 116 | estimates. It can also optimize the policies themselves. 117 | 118 | Parameters 119 | ---------- 120 | transition : Callable[[Any,Any],[[Any,float], bool]]] 121 | transition must be a callable function that takes as arguments the 122 | (state, action) and returns (new_state, reward), end. 
123 |     random_state : Callable[[Any], Any]
124 |         random state generator
125 |     actions : Sequence[Any]
126 |         Sequence of possible actions
127 |     v_hat : SGDWA
128 |         Function approximator to use for the state value function
129 |     q_hat: SGDWA, optional
130 |         Function approximator to use for the action-value function, by default None
131 |         and will be replaced by a mocked version of q_hat where a one hot
132 |         encoding for the actions is going to get appended to the state vector.
133 |     state_0 : Any, optional
134 |         Initial state, by default None (random)
135 |     action_0 : Any, optional
136 |         Initial action, by default None (random)
137 |     alpha : float, optional
138 |         Learning rate, by default 0.05
139 |     gamma : float, optional
140 |         Discount factor, by default 1.0
141 |     n_episodes : int, optional
142 |         Number of episodes to simulate, by default 1E4
143 |     max_steps : int, optional
144 |         Maximum number of steps per episode, by default 1E3
145 |     samples : int, optional
146 |         Number of samples to take, by default 1000
147 |     optimize : bool, optional
148 |         Whether to optimize the policy or not, by default False
149 |     policy : ModelFreeTLPolicy, optional
150 |         Policy to use, by default a greedy ModelFreeTLPolicy built from q_hat (epsilon-soft if eps is given)
151 |     tol : float, optional
152 |         Tolerance on the change of the weight vector used to declare convergence
153 |     eps : float, optional
154 |         Epsilon value for the epsilon-soft policy, by default None (no exploration)
155 | 
156 |     Returns
157 |     -------
158 |     vqpi : AVQPi
159 |         Value function, action-value function and policy approximators.
160 |     samples : List[Tuple[int, SGDWA, Optional[SGDWA], Optional[ModelFreeTLPolicy]]]
161 |         Samples taken during the simulation if any. The first element is the
162 |         index of the episode, the second is the value approximator, the third is
163 |         the action-value approximator and the fourth is the policy.
164 | 
165 |     Raises
166 |     ------
167 |     TransitionException: If any of the arguments is not of the correct type.
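    Example
    -------
    As called in examples/mountain_car.py, with the tile-coding approximators
    built there (a sketch rather than a doctest):

        vqpi, samples = gradient_mc(mountain_car, state_generator, ACTIONS,
            vhat, q_hat=qhat, state_0=(0, 0), action_0=0, n_episodes=500,
            max_steps=1E4, alpha=0.1/8, eps=0.1, optimize=True)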
168 | ''' 169 | 170 | policy = _set_policy(policy, eps, actions, v_hat, q_hat) 171 | 172 | _typecheck_all(transition=transition, 173 | constants=[gamma, alpha, n_episodes, max_steps, samples, tol], 174 | booleans=[optimize], policies=[policy]) 175 | 176 | _check_ranges(values=[gamma, alpha, n_episodes, max_steps, samples], 177 | ranges=[(0,1), (0,1), (1,np.inf), (1,np.inf), (1,1001)]) 178 | 179 | sample_step = _get_sample_step(samples, n_episodes) 180 | 181 | model = ModelFreeTL(transition, random_state, policy, gamma=gamma) 182 | vh, qh, samples = _gradient_mc(model, v_hat, state_0, action_0, 183 | alpha, int(n_episodes), int(max_steps), tol, optimize, sample_step) 184 | 185 | return AVQPi(vh, qh, policy), samples 186 | 187 | 188 | def _gradient_mc(MFS, v_hat, s_0, a_0, alpha, n_episodes, 189 | max_steps, tol, optimize, sample_step): 190 | 191 | α, γ, π = alpha, MFS.gamma, MFS.policy 192 | q_hat = π.q_hat 193 | 194 | samples, dnorm = [], TOL*2 195 | for n_episode in tqdm(range(n_episodes), desc=f'grad-MC', unit='episodes'): 196 | if dnorm < tol: 197 | break 198 | s_0, a_0 = _set_s0_a0(MFS, s_0, a_0) 199 | 200 | episode = MFS.generate_episode(s_0, a_0, π, max_steps) 201 | w_old = v_hat.w.copy() 202 | 203 | G = 0 204 | for s_t, a_t, r_tt in episode[::-1]: 205 | G = γ*G + r_tt 206 | v_hat.update(G, α, s_t) 207 | 208 | if optimize: 209 | q_hat.update(G, α, (s_t, a_t)) 210 | 211 | dnorm = lnorm(w_old - v_hat.w) 212 | 213 | if sample_step and n_episode % sample_step == 0: 214 | samples.append(get_sample(v_hat, q_hat, π, n_episode, optimize)) 215 | 216 | return v_hat, q_hat, samples 217 | 218 | 219 | def semigrad_tdn(transition: Transition, 220 | random_state: Callable[[Any], Any], 221 | actions: Sequence[Any], 222 | v_hat: SGDWA, 223 | q_hat: SGDWA=None, 224 | state_0: Any=None, 225 | action_0: Any=None, 226 | alpha: float=0.05, 227 | n: int=1, 228 | gamma: float=1.0, 229 | n_episodes: int=MAX_ITER, 230 | max_steps: int=MAX_STEPS, 231 | samples: int=1000, 232 | optimize: bool=False, 233 | policy: ModelFreeTLPolicy=None, 234 | tol: float=TOL, 235 | eps: float=None) -> Tuple[AVQPi, Samples]: 236 | '''Semi-Gradient n-step Temporal Difference 237 | 238 | Solver for the n-step temporal difference algorithm. The algorithm is 239 | semi-gradient in the sense that it uses a function approximator to 240 | estimate the _true_ value function. If optimize is set, since no 241 | encoding of the action into the feature basis is done, the algorithm 242 | will optimize the policy making one approximator per action. Naive, 243 | and cost-innefective 244 | 245 | Parameters 246 | ---------- 247 | transition : Callable[[Any,Any],[[Any,float], bool]]] 248 | transition must be a callable function that takes as arguments the 249 | (state, action) and returns (new_state, reward), end. 250 | random_state : Callable[[Any], Any] 251 | random state generator 252 | v_hat : SGDWA 253 | Function approximator to use for the state value function 254 | q_hat: SGDWA, optional 255 | Function approximator to use for the action-value function, by default None 256 | and will be replaced by a mocked version of q_hat where a one hot 257 | encoding is going to get appended to the state vector. 
258 | actions: Sequence[Any] 259 | Sequence of possible actions 260 | state_0 : Any, optional 261 | Initial state, by default None (random) 262 | action_0 : Any, optional 263 | Initial action, by default None (random) 264 | alpha : float, optional 265 | Learning rate, by default 0.1 266 | n : int, optional 267 | Number of steps to look ahead, by default 1 268 | gamma : float, optional 269 | Discount factor, by default 0.9 270 | n_episodes : int, optional 271 | Number of episodes to simulate, by default 1E4 272 | max_steps : int, optional 273 | Maximum number of steps per episode, by default 1E3 274 | samples : int, optional 275 | Number of samples to take, by default 1000 276 | optimize : bool, optional 277 | Whether to optimize the policy or not, by default False 278 | policy : ModelFreePolicy, optional 279 | Policy to use, by default equal probability ModelFreePolicy 280 | tol : float, optional 281 | Tolerance for estimating convergence estimations 282 | eps : float, optional 283 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 284 | 285 | Returns 286 | ------- 287 | vqpi : Tuple[VPi, QPi, Policy] 288 | Value function, action-value function, policy and samples if any. 289 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 290 | Samples taken during the simulation if any. The first element is the 291 | index of the iteration, the second is the value function, the third is 292 | the action-value function and the fourth is the TODO:. 293 | 294 | Raises 295 | ------ 296 | TransitionError: If any of the arguments is not of the correct type. 297 | ''' 298 | policy = _set_policy(policy, eps, actions, v_hat, q_hat) 299 | 300 | _typecheck_all(transition=transition, 301 | constants=[gamma, alpha, n_episodes, max_steps, samples, tol, n], 302 | booleans=[optimize], policies=[policy]) 303 | 304 | _check_ranges(values=[gamma, alpha, n_episodes, max_steps, samples, n], 305 | ranges=[(0,1), (0,1), (1,np.inf), (1,np.inf), (1,1001), (1, np.inf)]) 306 | 307 | sample_step = _get_sample_step(samples, n_episodes) 308 | 309 | model = ModelFreeTL(transition, random_state, policy, gamma=gamma) 310 | v, q, samples = _semigrad_tdn(model, v_hat, state_0, action_0, 311 | alpha, n, int(n_episodes), int(max_steps), tol, optimize, sample_step) 312 | 313 | return AVQPi(v, q, policy), samples 314 | 315 | 316 | def _semigrad_tdn(MFS, v_hat, s_0, a_0, alpha, n, n_episodes, max_steps, 317 | tol, optimize, sample_step): 318 | '''Semi gradient n-step temporal difference 319 | 320 | DRY but clear. 
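    The update target is the n-step return

        G = R_{t+1} + γ*R_{t+2} + ... + γ^{n-1}*R_{t+n} + γ^n * v_hat(S_{t+n})

    (with the analogous q_hat bootstrap when optimizing), computed below from
    the precomputed gammatron vector of discount powers.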
321 | ''' 322 | 323 | α, γ, π = alpha, MFS.gamma, MFS.policy 324 | gammatron = np.array([γ**i for i in range(n)]) 325 | q_hat = π.q_hat 326 | 327 | samples, dnorm = [], TOL*2 328 | for n_episode in tqdm(range(n_episodes), desc=f'semigrad-TD', unit='episodes'): 329 | if dnorm < tol: 330 | break 331 | s, a = _set_s0_a0(MFS, s_0, a_0) 332 | 333 | w_old = v_hat.w.copy() 334 | 335 | T = int(max_steps) 336 | R, A, S, G = [], [a], [s], 0 337 | for t in range(T): 338 | if t < T: 339 | (s, r), end = MFS.step_transition(s, a) 340 | R.append(r) 341 | S.append(s) 342 | if end: 343 | T = t + 1 344 | else: 345 | a = π(s) 346 | A.append(a) 347 | 348 | tau = t - n + 1 349 | if tau >= 0: 350 | rr = np.array(R[tau:min(tau+n, T)]) 351 | G = gammatron[:rr.shape[0]].dot(rr) 352 | G_v, G_q = G, G 353 | if tau + n < T: 354 | G_v = G_v + γ**n * v_hat(S[tau+n]) 355 | G_q = G_q + γ**n * q_hat((S[tau+n], A[tau+n])) 356 | 357 | s_t = S[tau] 358 | a_t = A[tau] 359 | 360 | v_hat.update(G_v, α, s_t) 361 | 362 | if optimize: 363 | q_hat.update(G_q, α, (s_t, a_t)) 364 | 365 | if tau == T - 1: 366 | break 367 | 368 | dnorm = lnorm(w_old - v_hat.w) 369 | 370 | if n_episode % sample_step == 0: 371 | samples.append(get_sample(v_hat, q_hat, π, n_episode, optimize)) 372 | n_episode += 1 373 | 374 | return v_hat, q_hat, samples 375 | 376 | 377 | # TODO: policy setting and optimize 378 | def lstd(transition: Transition, 379 | random_state: Callable[[Any], Any], 380 | state_0: Any=None, 381 | action_0: Any=None, 382 | alpha: float=0.05, 383 | gamma: float=1.0, 384 | n_episodes: int=MAX_ITER, 385 | max_steps: int=MAX_STEPS, 386 | samples: int=1000, 387 | optimize: bool=False, 388 | policy: ModelFreeTLPolicy=None, 389 | tol: float=TOL, eps: float=None) -> Tuple[AVQPi, Samples]: 390 | '''Least squares n-step temporal differnece 391 | 392 | Parameters 393 | ---------- 394 | transition : Callable[[Any,Any],[[Any,float], bool]]] 395 | transition must be a callable function that takes as arguments the 396 | (state, action) and returns (new_state, reward), end. 397 | random_state: Callable[[Any], Any] 398 | random state generator 399 | actions : Sequence[Any] 400 | Sequence of possible actions 401 | state_0 : Any, optional 402 | Initial state, by default None (random) 403 | action_0 : Any, optional 404 | Initial action, by default None (random) 405 | alpha : float, optional 406 | Learning rate, by default 0.1 407 | gamma : float, optional 408 | Discount factor, by default 0.9 409 | n_episodes : int, optional 410 | Number of episodes to simulate, by default 1E4 411 | max_steps : int, optional 412 | Maximum number of steps per episode, by default 1E3 413 | samples : int, optional 414 | Number of samples to take, by default 1000 415 | optimize : bool, optional 416 | Whether to optimize the policy or not, by default False 417 | policy : ModelFreePolicy, optional 418 | Policy to use, by default equal probability ModelFreePolicy 419 | tol : float, optional 420 | Tolerance for estimating convergence estimations 421 | eps : float, optional 422 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 423 | 424 | Returns 425 | ------- 426 | vqpi : Tuple[VPi, QPi, Policy] 427 | Value function, action-value function, policy and samples if any. 428 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 429 | Samples taken during the simulation if any. The first element is the 430 | index of the iteration, the second is the value function, the third is 431 | the action-value function and the fourth is the TODO:. 
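        Note: the underlying _lstd routine is currently a stub that raises
        NotImplementedError, so this solver is not usable yet.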
432 | 433 | Raises 434 | ------ 435 | TransitionError: If any of the arguments is not of the correct type. 436 | ''' 437 | 438 | #policy = _set_policy(policy, eps, actions, approximator) 439 | 440 | _typecheck_all(transition=transition, 441 | constants=[gamma, alpha, n_episodes, max_steps, samples, tol], 442 | booleans=[optimize], policies=[policy]) 443 | 444 | _check_ranges(values=[gamma, alpha, n_episodes, max_steps, samples], 445 | ranges=[(0,1), (0,1), (1,np.inf), (1,np.inf), (1,1001), (1, np.inf)]) 446 | 447 | sample_step = _get_sample_step(samples, n_episodes) 448 | 449 | model = ModelFreeTL(transition, random_state, policy, gamma=gamma) 450 | v, q, samples = _lstd(model, state_0, action_0, 451 | alpha, int(n_episodes), int(max_steps), tol, optimize, sample_step) 452 | 453 | return AVQPi(v, q, policy), samples 454 | 455 | 456 | def _lstd(MF, s_0, a_0, alpha, n_episodes, max_steps, tol, optimize, sample_step): 457 | 458 | raise NotImplementedError 459 | 460 | 461 | def diff_semigradn(transition: Transition, 462 | random_state: Callable[[Any], Any], 463 | v_hat: SGDWA, 464 | q_hat: SGDWA=None, 465 | actions: Sequence[Any]=None, 466 | state_0: Any=None, 467 | action_0: Any=None, 468 | alpha: float=0.1, 469 | beta: float=0.1, 470 | n: int=1, 471 | T: int=1E5, 472 | samples: int=1000, 473 | optimize: bool=False, 474 | policy: ModelFreeTLPolicy=None, 475 | tol: float=TOL, 476 | eps: float=None) -> Tuple[AVQPi, Samples]: 477 | '''Differential semi gradient n-step Sarsa for estimation and control. 478 | 479 | The average reward setting is one of that comes to solve many problems 480 | related with discounted settings with function approximation. The average 481 | reward setting evaluates the quality of a policy by the average rate of reward. 482 | That is how good you expect the reward to be in average. 483 | 484 | Parameters 485 | ---------- 486 | transition : Callable[[Any,Any],[[Any,float], bool]]] 487 | transition must be a callable function that takes as arguments the 488 | (state, action) and returns (new_state, reward), end. 489 | random_state : Callable[[Any], Any] 490 | random state generator 491 | v_hat : SGDWA 492 | Function approximator to use for the state value function 493 | q_hat: SGDWA, optional 494 | Function approximator to use for the action-value function, by default None 495 | and will be replaced by a mocked version of q_hat where a one hot 496 | encoding is going to get appended to the state vector. 
497 | actions: Sequence[Any] 498 | Sequence of possible actions 499 | state_0 : Any, optional 500 | Initial state, by default None (random) 501 | action_0 : Any, optional 502 | Initial action, by default None (random) 503 | alpha : float, optional 504 | Learning rate, by default 0.1 505 | beta : float, optional 506 | Step size for average reward updates, by default 0.1 507 | n : int, optional 508 | Number of steps to look ahead, by default 1 509 | T : int, optional 510 | Number of time steps to simulate, by default 1E5 511 | samples : int, optional 512 | Number of samples to take, by default 1000 513 | optimize : bool, optional 514 | Whether to optimize the policy or not, by default False 515 | policy : ModelFreePolicy, optional 516 | Policy to use, by default equal probability ModelFreePolicy 517 | tol : float, optional 518 | Tolerance for estimating convergence estimations 519 | eps : float, optional 520 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 521 | 522 | Returns 523 | ------- 524 | vqpi : Tuple[VPi, QPi, Policy] 525 | Value function, action-value function, policy and samples if any. 526 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 527 | Samples taken during the simulation if any. The first element is the 528 | index of the iteration, the second is the value function, the third is 529 | the action-value function and the fourth is the Policy. 530 | 531 | Raises 532 | ------ 533 | TransitionError: If any of the arguments is not of the correct type. 534 | ''' 535 | policy = _set_policy(policy, eps, actions, v_hat, q_hat) 536 | 537 | _typecheck_all(transition=transition, 538 | constants=[alpha, beta, T, samples, tol], 539 | booleans=[optimize], policies=[policy]) 540 | 541 | _check_ranges(values=[alpha, beta, T, samples], 542 | ranges=[(0,1), (0,1), (1,np.inf), (1,1001)]) 543 | 544 | sample_step = _get_sample_step(samples, T) 545 | 546 | model = ModelFreeTL(transition, random_state, policy) 547 | vh, qh, samples = _diff_semigrad(model, v_hat, state_0, action_0, 548 | alpha, beta, n, int(T), tol, optimize, sample_step) 549 | 550 | return AVQPi(vh, qh, policy), samples 551 | 552 | 553 | def _diff_semigrad(MFS, v_hat, s_0, a_0, alpha, beta, n, T, tol, 554 | optimize, sample_step): 555 | ''' 556 | DRY but clear. 
Beta greek letter is written as 557 | ''' 558 | α, β, π = alpha, beta, MFS.policy 559 | q_hat = π.q_hat 560 | 561 | samples, dnorm = [], TOL*2 562 | s, a = _set_s0_a0(MFS, s_0, a_0) 563 | 564 | w_old = v_hat.w.copy() 565 | 566 | R, A, S, avg_R = [], [a], [s], 0 567 | for t in tqdm(range(T), desc=f'semigrad-TD', unit='episodes'): 568 | if dnorm < tol: 569 | break 570 | 571 | (s, r), end = MFS.step_transition(s, a) 572 | R.append(r) 573 | S.append(s) 574 | if end: 575 | break 576 | else: 577 | a = π(s) 578 | A.append(a) 579 | 580 | if t - n + 1 >= 0: 581 | rr = np.array(R) 582 | R_R = rr.sum() - avg_R*n 583 | δ_v = R_R + v_hat(S[n]) - v_hat(S[0]) 584 | δ_q = R_R + q_hat((S[n], A[n])) - q_hat((S[0], A[0])) 585 | 586 | avg_R = avg_R + β*δ_q 587 | 588 | s_t = S[0] 589 | a_t = A[0] 590 | 591 | v_hat.update(δ_v, α, s_t) 592 | if optimize: 593 | q_hat.update(δ_q, α, (s_t, a_t)) 594 | 595 | R.pop(0) 596 | A.pop(0) 597 | S.pop(0) 598 | 599 | dnorm = lnorm(w_old - v_hat.w) 600 | 601 | if t % sample_step == 0: 602 | samples.append(get_sample(v_hat, q_hat, π, t, optimize)) 603 | 604 | return v_hat, q_hat, samples 605 | 606 | 607 | def semigrad_td_lambda(transition: Transition, 608 | random_state: Callable, 609 | v_hat: SGDWA, 610 | q_hat: SGDWA=None, 611 | actions: Sequence[Any]=None, 612 | state_0: Any=None, 613 | action_0: Any=None, 614 | alpha: float=0.1, 615 | lambdaa: float=0.1, 616 | gamma: float=0.9, 617 | n_episodes: int=1E5, 618 | max_steps: int=1E3, 619 | samples: int=1000, 620 | optimize: bool=False, 621 | policy: ModelFreeTLPolicy=None, 622 | tol: float=TOL, 623 | eps: float=None) -> Tuple[AVQPi, Samples]: 624 | '''Semi-gradient TD(λ). 625 | 626 | Eligibility traces semi gradient TD(λ). This algorithms extends more 627 | generally to TD and MC. It also improves off-line λ-return algorithms following 628 | the forward view, alas backward view. It updates the weight vector on every step, 629 | improving sooner, and computations are equally distributed among the time steps. 630 | Also it can be applied to continuing problems rather than just episodic ones. 631 | 632 | Parameters 633 | ---------- 634 | transition : Callable[[Any,Any],[[Any,float], bool]]] 635 | transition must be a callable function that takes as arguments the 636 | (state, action) and returns (new_state, reward), end. 637 | random_state : Callable[[Any], Any] 638 | random state generator 639 | v_hat : SGDWA 640 | Function approximator to use for the state value function 641 | q_hat: SGDWA, optional 642 | Function approximator to use for the action-value function, by default None 643 | and will be replaced by a mocked version of q_hat where a one hot 644 | encoding is going to get appended to the state vector. 
645 | actions: Sequence[Any] 646 | Sequence of possible actions 647 | state_0 : Any, optional 648 | Initial state, by default None (random) 649 | action_0 : Any, optional 650 | Initial action, by default None (random) 651 | alpha : float, optional 652 | Learning rate, by default 0.1 653 | lambdaa : float, optional 654 | Learning rate, by default 0.1 655 | gamma : float, optional 656 | Step size for average reward updates, by default 0.1 657 | n_episodes : int, optional 658 | Number of time steps to simulate, by default 1E5 659 | max_steps : int, optional 660 | Maximum number of steps per episode, by default 1000 661 | samples : int, optional 662 | Number of samples to take, by default 1000 663 | optimize : bool, optional 664 | Whether to optimize the policy or not, by default False 665 | policy : ModelFreePolicy, optional 666 | Policy to use, by default equal probability ModelFreePolicy 667 | tol : float, optional 668 | Tolerance for estimating convergence estimations 669 | eps : float, optional 670 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 671 | 672 | Returns 673 | ------- 674 | vqpi : Tuple[VPi, QPi, Policy] 675 | Value function, action-value function, policy and samples if any. 676 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 677 | Samples taken during the simulation if any. The first element is the 678 | index of the iteration, the second is the value function, the third is 679 | the action-value function and the fourth is the Policy. 680 | 681 | Raises 682 | ------ 683 | TransitionError: If any of the arguments is not of the correct type. 684 | ''' 685 | policy = _set_policy(policy, eps, actions, v_hat, q_hat) 686 | 687 | _typecheck_all(transition=transition, 688 | constants=[alpha, gamma, lambdaa, n_episodes, samples, tol], 689 | booleans=[optimize], policies=[policy]) 690 | 691 | _check_ranges(values=[alpha, gamma, lambdaa, n_episodes, samples], 692 | ranges=[(0,1), (0,1), (0,1), (1,np.inf), (1,1001)]) 693 | 694 | sample_step = _get_sample_step(samples, T) 695 | 696 | model = ModelFreeTL(transition, random_state, policy) 697 | vh, qh, samples = _td_lambda(model, v_hat, state_0, action_0, alpha, 698 | lambdaa, int(n_episodes), int(max_steps), tol, optimize, sample_step) 699 | 700 | return AVQPi(vh, qh, policy), samples 701 | 702 | 703 | def _td_lambda(MFS, v_hat, s_0, a_0, alpha, lambdaa, n_episodes, max_steps, tol, 704 | sample_step, optimize): 705 | '''DRY but clear.''' 706 | α, γ, π, λ = alpha, MFS.gamma, MFS.policy, lambdaa 707 | q_hat = π.q_hat 708 | 709 | samples, dnorm = [], TOL*2 710 | for n_episode in tqdm(range(n_episodes), desc=f'semigrad-TD', unit='episodes'): 711 | if dnorm < tol: 712 | break 713 | s, a = _set_s0_a0(MFS, s_0, a_0) 714 | 715 | zv = np.zeros_like(v_hat.w) 716 | zq = np.zeros_like(q_hat.w) 717 | 718 | w_old = v_hat.w.copy() 719 | 720 | T = int(max_steps) 721 | for _ in range(T): 722 | (s_, r), end = MFS.step_transition(s, a) 723 | if end: 724 | break 725 | else: 726 | a = π(s) 727 | zv = γ*λ*zv + v_hat.grad(s) 728 | zq = γ*λ*zq + q_hat.grad((s, a)) 729 | Uv = r + γ*v_hat(s_) 730 | Uq = r + γ*q_hat(s_, a) 731 | 732 | v_hat.et_update(Uv, α, s, zv) 733 | 734 | if optimize: 735 | q_hat.et_update(Uq, α, (s, a), zq) 736 | 737 | s = s_ 738 | 739 | dnorm = lnorm(w_old - v_hat.w) 740 | 741 | if n_episode % sample_step == 0: 742 | samples.append(get_sample(v_hat, q_hat, π, n_episode, optimize)) 743 | n_episode += 1 744 | 745 | return v_hat, q_hat, samples 746 | 747 | 748 | def reinforce_mc(transition: Transition, 
749 |                  random_state: Callable,
750 |                  pi_hat: Approximator,
751 |                  actions: Sequence[Any]=None,
752 |                  state_0: Any=None,
753 |                  action_0: Any=None,
754 |                  alpha: float=0.1,
755 |                  gamma: float=0.9,
756 |                  n_episodes: int=MAX_ITER,
757 |                  max_steps: int=MAX_STEPS,
758 |                  samples: int=1000,
759 |                  policy: REINFORCEPolicy=None,
760 |                  tol: float=TOL) -> Tuple[REINFORCEPolicy, List[REINFORCEPolicy]]:
761 |     '''Monte Carlo policy-gradient (REINFORCE) control algorithm.
762 | 
763 |     This algorithm requires a differentiable policy. Whatever the underlying
764 |     approximator, its parameters are optimized via SGD. For more information
765 |     see Sutton & Barto, section 13.3 (p. 328).
766 | 
767 |     Parameters
768 |     ----------
769 |     transition : Callable[[Any,Any],[[Any,float], bool]]]
770 |         transition must be a callable function that takes as arguments the
771 |         (state, action) and returns (new_state, reward), end.
772 |     random_state : Callable[[Any], Any]
773 |         random state generator
774 |     pi_hat : Approximator
775 |         Function approximator for the differentiable policy. It is not
776 |         used if ``policy`` is provided.
777 |     actions: Sequence[Any]
778 |         Sequence of possible actions
779 |     state_0 : Any, optional
780 |         Initial state, by default None (random)
781 |     action_0 : Any, optional
782 |         Initial action, by default None (random)
783 |     alpha : float, optional
784 |         Learning rate, by default 0.1
785 |     gamma : float, optional
786 |         Discount factor, by default 0.9
787 |     n_episodes : int, optional
788 |         Number of episodes to simulate, by default 1E4
789 |     max_steps : int, optional
790 |         Maximum number of steps per episode, by default 1E3
791 |     samples : int, optional
792 |         Number of samples to take, by default 1000
793 |     policy : REINFORCEPolicy, optional
794 |         Policy to use; by default a new one is built from ``actions`` and ``pi_hat``.
795 |     tol : float, optional
796 |         Tolerance for convergence of the policy parameters.
797 | 
798 |     Returns
799 |     -------
800 |     pi : REINFORCEPolicy
801 |         The optimized differentiable policy.
802 |     samples : List[REINFORCEPolicy]
803 |         Snapshots of the policy taken during the simulation, if any.
804 | 
805 |     Raises
806 |     ------
807 |     TransitionError: If any of the arguments is not of the correct type.
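
    Examples
    --------
    Minimal illustrative sketch. The corridor transition, the ``random_state``
    generator and the ``pi_hat`` placeholder below are assumptions for the sake
    of the example, not objects shipped with this module:

    >>> import numpy as np
    >>> actions = ['left', 'right']
    >>> def transition(state, action):
    ...     s = state + (1 if action == 'right' else -1)
    ...     if s >= 3:
    ...         return (s, 1.0), True   # reward only at the right end
    ...     if s <= -3:
    ...         return (s, 0.0), True
    ...     return (s, 0.0), False
    >>> random_state = lambda *args: np.random.randint(-2, 3)
    >>> pi_hat = ...  # any differentiable Approximator over state features
    >>> pi, pi_samples = reinforce_mc(transition, random_state, pi_hat,
    ...                               actions=actions, alpha=2e-4, gamma=1.0,
    ...                               n_episodes=1000)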
808 | ''' 809 | policy = policy if policy else REINFORCEPolicy(actions, pi_hat) 810 | 811 | _typecheck_all(transition=transition, constants=[alpha, gamma, n_episodes, samples, tol], 812 | policies=[policy]) 813 | 814 | _check_ranges(values=[alpha, gamma, n_episodes, samples], 815 | ranges=[(0,1), (0,1), (1,np.inf), (1,1001)]) 816 | 817 | sample_step = _get_sample_step(samples, n_episodes) 818 | 819 | model = ModelFreeTL(transition, random_state, policy) 820 | pi, samples = _reinforce_mc(model, state_0, action_0, alpha, 821 | int(n_episodes), int(max_steps), tol, sample_step) 822 | 823 | return pi, samples 824 | 825 | 826 | def _reinforce_mc(MFS, s_0, a_0, alpha, n_episodes, max_steps, tol, sample_step): 827 | ''''not returning the usual sample set''' 828 | α, γ, π = alpha, MFS.gamma, MFS.policy 829 | gammatron = np.array([γ**i for i in range(max_steps)]) 830 | samples, dnorm = [], TOL*2 831 | for n_episode in tqdm(range(n_episodes), desc=f'MC Policy Gradient', unit='episodes'): 832 | s, a = _set_s0_a0(MFS, s_0, a_0) 833 | theta_old = deepcopy(π.pi_hat.w) 834 | episode = MFS.generate_episode(s, a, π, max_steps) 835 | rr = np.array([r for _, _, r in episode]) 836 | for t, (s, a, _) in enumerate(episode): 837 | G = gammatron[:len(episode)-t].dot(rr[t:]) 838 | c = α*G*(γ**t) 839 | π.update_policy(c, s, a) 840 | 841 | if n_episode % sample_step == 0: 842 | samples.append(deepcopy(π)) 843 | 844 | dnorm = lnorm(π.pi_hat.w - theta_old) 845 | if dnorm < tol: 846 | break 847 | 848 | return π, samples 849 | -------------------------------------------------------------------------------- /rl/solvers/model_based.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | from numpy.linalg import norm as lnorm 5 | 6 | from rl.utils import ( 7 | Policy, 8 | _typecheck_all, 9 | _get_sample_step, 10 | _check_ranges, 11 | VQPi, 12 | Samples, 13 | Transition, 14 | Vpi, 15 | Qpi, 16 | PQueue, 17 | MAX_ITER, 18 | MAX_STEPS, 19 | TOL 20 | ) 21 | 22 | #TODO: refactor, docs 23 | 24 | def get_sample(MDP, v, q, π, n_iter): 25 | _idx = n_iter 26 | # TODO: refactor, there is no states tabular index here 27 | # and there is not stateaction 28 | _v, _q = Vpi(v.copy(), MDP.states), Qpi(q.copy(), MDP.stateaction) 29 | _pi = None 30 | return (_idx, _v, _q, _pi) 31 | 32 | 33 | # There are in-place methods optional. That is, there is actually an 34 | # inplace sweep of all states instantaneously, through the vectorization 35 | # of the update equations for DP methods. Should be Faster to execute and 36 | # slower to converge. But tests must be carried out to verify this claim. 
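
# ---------------------------------------------------------------------------
# Editorial sketch (not part of this module): the two sweep styles below on a
# made-up 3-state MDP whose fixed policy is already folded into a transition
# kernel P and an expected-reward vector r. The synchronous ("naive") sweep
# reads only values from the previous iteration, while the in-place sweep
# reuses values refreshed earlier in the same pass; both converge to the same
# fixed point v = r + γ P v.
def _policy_evaluation_sketch():
    import numpy as np
    P = np.array([[0.9, 0.1, 0.0],    # assumed P(s'|s) under the fixed policy
                  [0.0, 0.5, 0.5],
                  [0.1, 0.0, 0.9]])
    r = np.array([0.0, 1.0, 2.0])     # assumed expected reward per state
    gamma = 0.9

    v_sync, v_inplace = np.zeros(3), np.zeros(3)
    for _ in range(200):
        v_sync = r + gamma * P @ v_sync         # synchronous, fully vectorized
        for s in range(3):                      # in-place (Gauss-Seidel style)
            v_inplace[s] = r[s] + gamma * P[s] @ v_inplace
    return v_sync, v_inplace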
37 | 38 | 39 | def vq_pi_iter_naive(MDP, policy: Policy, tol: float=TOL, inplace=False, 40 | max_iters: int=MAX_STEPS) -> Tuple[VQPi, Samples]: 41 | 42 | sample_step = _get_sample_step(samples, max_iters//10) # RULE OF THUMB 43 | 44 | v, q, samples = _vq_pi_iter_naive(MDP, policy, tol, max_iters, inplace, 45 | sample_step) 46 | 47 | return VQPi((v, q, policy)), samples 48 | 49 | 50 | def _inplace_step_pe(MDP, vᵢ, _, π_sa, r_sa, p_s, γ): 51 | for s in range(MDP.S): 52 | vᵢ[s] = np.dot(π_sa[s], r_sa[:,s]) 53 | vᵢ[s] += γ * np.dot(p_s[s] @ vᵢ, π_sa[s]) 54 | return vᵢ 55 | 56 | 57 | def _naive_step_pe(_, vᵢ, vᵢ_1, π_sa, r_sa, p_s, γ): 58 | vᵢ = np.diag(π_sa @ r_sa) 59 | vᵢ = vᵢ + γ * np.diag((p_s @ vᵢ_1) @ π_sa.T) 60 | return vᵢ 61 | 62 | 63 | # pe: policy evaluation 64 | ITER_NAIVE_STEP_MAP = { 65 | 'inplace': _inplace_step_pe, 66 | 'naive': _naive_step_pe 67 | } 68 | 69 | 70 | def _vq_pi_iter_naive(MDP, policy, tol, max_iters, inplace, sample_step): 71 | γ = MDP.gamma 72 | p_s = MDP.p_s 73 | 74 | vᵢ = np.ones(MDP.S) 75 | diff_norm = TOL*2 76 | 77 | update_step = ITER_NAIVE_STEP_MAP['inplace' if inplace else 'naive'] 78 | 79 | π_sa = np.array([policy.π(s) for s in range(MDP.S)]) #SxA 80 | r_sa = np.array([[MDP.r_sa(s,a) for s in range(MDP.S)] 81 | for a in range(MDP.A)]) #AxS 82 | 83 | n_iter, samples = 0, [] 84 | while (n_iter < max_iters) and (diff_norm > tol): 85 | vᵢ_1 = vᵢ.copy() 86 | vᵢ = update_step(MDP, vᵢ, vᵢ_1, π_sa, r_sa, p_s, γ) 87 | diff_norm = lnorm(vᵢ - vᵢ_1) 88 | n_iter += 1 89 | 90 | if n_iter % sample_step == 0: 91 | samples.append(get_sample(MDP, vᵢ, None, policy, n_iter)) 92 | 93 | vπ = vᵢ 94 | qπ = r_sa + (p_s @ vπ).T 95 | 96 | return vπ, qπ 97 | 98 | 99 | def policy_iteration(MDP, policy: Policy, tol_eval: float = TOL, 100 | max_iters_eval: int = MAX_ITER, tol_opt: float = TOL, 101 | max_iters_opt: int = MAX_ITER, samples: int=1000 102 | ) -> Tuple[VQPi, Samples]: 103 | 104 | vᵢ_1, q_i_1 = vq_pi_iter_naive(MDP, policy, tol_eval, max_iters_eval) 105 | vᵢ, q_i = vᵢ_1.copy(), q_i_1.copy() 106 | 107 | diff_norm = 2*tol_opt 108 | 109 | n_iter = 0 110 | 111 | while (n_iter < max_iters_opt) and (diff_norm > tol_opt): 112 | vᵢ_1 = vᵢ.copy() 113 | q_i_1 = q_i.copy() 114 | 115 | policy.update_policy(q_i_1) 116 | vᵢ, q_i = vq_pi_iter_naive(MDP, policy, tol_eval, max_iters_eval) 117 | 118 | n_iter += 1 119 | diff_norm = lnorm(vᵢ - vᵢ_1) 120 | 121 | return vᵢ, q_i, samples 122 | 123 | 124 | def _inplace_step_vi(MDP, vᵢ, _, r_sa, p_s, γ): 125 | for s in range(MDP.S): 126 | vᵢ[s] = np.max(r_sa[:,s] + γ * (p_s[s] @ vᵢ)) 127 | return vᵢ, None 128 | 129 | 130 | def _naive_step_vi(_, vᵢ, vᵢ_1, r_sa, p_s, γ): 131 | qᵢ = r_sa + γ * (p_s @ vᵢ_1).T 132 | vᵢ = np.max(qᵢ, axis=0) 133 | return vᵢ, qᵢ 134 | 135 | 136 | VALUE_ITERATION_STEP_MAP = { 137 | 'inplace': _inplace_step_vi, 138 | 'naive': _naive_step_vi 139 | } 140 | 141 | 142 | def value_iteration(MDP, policy: Policy = None, inplace: bool=False, 143 | tol: float = TOL, max_iters: int=MAX_ITER) -> Tuple[VQPi, Samples]: 144 | 145 | sample_step = _get_sample_step(samples, max_iters//10) # RULE OF THUMB 146 | 147 | v, q, samples = _value_iteration(MDP, policy, tol, max_iters, inplace, 148 | sample_step) 149 | 150 | return VQPi((v, q, policy)), samples 151 | 152 | 153 | def _value_iteration(MDP, policy, tol, max_iters, inplace, sample_step): 154 | policy = policy if policy else MDP.policy 155 | 156 | γ = MDP.gamma 157 | p_s = MDP.p_s 158 | 159 | vᵢ = np.ones(MDP.S) 160 | diff_norm = TOL*2 161 | 162 | update_step = 
VALUE_ITERATION_STEP_MAP['inplace' if inplace else 'naive'] 163 | 164 | r_sa = np.array([[MDP.r_sa(s,a) for s in range(MDP.S)] 165 | for a in range(MDP.A)]) #AxS 166 | 167 | n_iter, samples = 0, [] 168 | while (n_iter < max_iters) and (diff_norm > tol): 169 | vᵢ_1 = vᵢ.copy() 170 | vᵢ, qᵢ = update_step(MDP, vᵢ, vᵢ_1, r_sa, p_s, γ) 171 | diff_norm = lnorm(vᵢ - vᵢ_1) 172 | n_iter += 1 173 | 174 | if n_iter % sample_step == 0: 175 | samples.append(get_sample(MDP, vᵢ, qᵢ, policy, n_iter)) 176 | 177 | policy.update_policy(qᵢ) -------------------------------------------------------------------------------- /rl/solvers/model_free.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL - Copyright © 2023 Iván Belenky @Leculette 3 | """ 4 | 5 | from typing import ( 6 | Tuple, 7 | Sequence, 8 | Any 9 | ) 10 | 11 | from tqdm import tqdm 12 | import numpy as np 13 | from numpy.linalg import norm as lnorm 14 | 15 | from rl.model_free import ( 16 | ModelFree, 17 | ModelFreePolicy, 18 | EpsilonSoftPolicy 19 | ) 20 | from rl.utils import ( 21 | Policy, 22 | _typecheck_all, 23 | _get_sample_step, 24 | _check_ranges, 25 | VQPi, 26 | Samples, 27 | Transition, 28 | Vpi, 29 | Qpi, 30 | PQueue, 31 | MAX_ITER, 32 | MAX_STEPS, 33 | TOL 34 | ) 35 | 36 | 37 | def get_sample(MF, v, q, π, n_episode, optimize): 38 | _idx = n_episode 39 | _v, _q = Vpi(v.copy(), MF.states), Qpi(q.copy(), MF.stateaction) 40 | _pi = None 41 | if optimize: 42 | _pi = ModelFreePolicy(MF.actions.N, MF.states.N) 43 | _pi.pi = π.pi.copy() 44 | return (_idx, _v, _q, _pi) 45 | 46 | 47 | def _set_s0_a0(MF, s, a): 48 | s_0, a_0 = MF.random_sa() 49 | s_0 = s_0 if not s else s 50 | a_0 = a_0 if not a else a 51 | return s_0, a_0 52 | 53 | 54 | def _set_policy(policy, eps, actions, states): 55 | if not policy and eps: 56 | _typecheck_all(constants=[eps]) 57 | _check_ranges(values=[eps], ranges=[(0,1)]) 58 | policy = EpsilonSoftPolicy(actions, states, eps=eps) 59 | elif not policy: 60 | policy = ModelFreePolicy(actions, states) 61 | 62 | return policy 63 | 64 | 65 | def alpha_mc(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 66 | gamma: float=0.9, alpha: float=0.05, use_N :bool=False, first_visit: bool=True, 67 | exploring_starts: bool=True, n_episodes: int=MAX_ITER, max_steps: int=MAX_STEPS, 68 | samples: int=1000, optimize: bool=False, policy: ModelFreePolicy=None, 69 | eps: float=None) -> Tuple[VQPi, Samples]: 70 | '''α-MC state and action-value function estimation, policy optimization 71 | 72 | Alpha weighted Monte Carlo state and action-value function estimation, policy 73 | optimization. By setting use_N to True, it will use the classical weighting 74 | schema, utilizing N(s) instead of a contstant α. 75 | 76 | Parameters 77 | ---------- 78 | states : Sequence[Any] 79 | actions : Sequence[Any] 80 | transition : Callable[[Any,Any],[[Any,float], bool]]] 81 | transition must be a callable function that takes as arguments the 82 | (state, action) and returns (new_state, reward), end. 83 | gamma : float, optional 84 | Discount factor, by default 0.9 85 | alpha : float, optional 86 | Learning rate, by default 0.1 87 | use_N : bool, optional 88 | If true, it will use 1/N(s) (number of visits) instead of α 89 | first_visit : bool, optional 90 | If true, it will only use the first visit to a state, by default True 91 | exploring_starts : bool, optional 92 | Random action at the start of each episode. 
93 | n_episodes : int, optional 94 | Number of episodes to simulate, by default 1E4 95 | max_steps : int, optional 96 | Maximum number of steps per episode, by default 1E3 97 | samples : int, optional 98 | Number of samples to take, by default 1000 99 | optimize : bool, optional 100 | Whether to optimize the policy or not, by default False 101 | policy : ModelFreePolicy, optional 102 | Policy to use, by default equal probability ModelFreePolicy 103 | eps : float, optional 104 | Epsilon for the EpsilonSoftPolicy, by default None (no exploration) 105 | 106 | Returns 107 | ------- 108 | vqpi : Tuple[VPi, QPi, Policy] 109 | Value function, action-value function, policy and samples if any. 110 | samples : Tuple[int, Vpi, Qpi, Policy] 111 | Samples taken during the simulation if any. The first element is the 112 | index of the iteration, the second is the value function, the third is 113 | the action-value function and the fourth is the policy until 114 | optimization point idx. 115 | 116 | Raises 117 | ------ 118 | TransitionException: transition calls function checks. 119 | ''' 120 | policy = _set_policy(policy, eps, actions, states) 121 | 122 | _typecheck_all(tabular_idxs=[states, actions],transition=transition, 123 | constants=[gamma, alpha, n_episodes, max_steps, samples], 124 | booleans=[use_N, first_visit, exploring_starts, optimize], 125 | policies=[policy]) 126 | 127 | _check_ranges(values=[gamma, alpha, n_episodes, max_steps, samples], 128 | ranges=[(0,1), (0,1), (1,np.inf), (1,np.inf), (1,1001)]) 129 | 130 | sample_step = _get_sample_step(samples, n_episodes) 131 | 132 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 133 | v, q, samples = _visit_monte_carlo(model, first_visit, exploring_starts, use_N, 134 | alpha, int(n_episodes), max_steps, optimize, sample_step) 135 | 136 | return VQPi((v, q, model.policy.pi)), samples 137 | 138 | 139 | def _mc_step(v, q, t, s_t, a_t, s, a, n_s, n_sa, G, first_visit): 140 | if s_t not in s[:-(t+1)] or not first_visit: 141 | n_s[s_t] = n_s[s_t] + 1 142 | v[s_t] = v[s_t] + (G - v[s_t])/n_s[s_t] 143 | 144 | q_key = (s_t, a_t) 145 | if q_key not in zip(s[:-(t+1)],a[:-(t+1)]) or not first_visit: 146 | n_sa[q_key] = n_sa[q_key] + 1 147 | q[q_key] = q[q_key] + (G - q[q_key])/n_sa[q_key] 148 | return True 149 | 150 | return False 151 | 152 | 153 | def _mc_step_α(v, q, t, s_t, a_t, s, a, α, G, first_visit): 154 | if s_t not in s[:-(t+1)] or not first_visit: 155 | v[s_t] = v[s_t] + α*(G - v[s_t]) 156 | 157 | q_key = (s_t, a_t) 158 | if q_key not in zip(s[:-(t+1)],a[:-(t+1)]) or not first_visit: 159 | q[q_key] = q[q_key] + α*(G - q[q_key]) 160 | return True 161 | 162 | return False 163 | 164 | 165 | def _visit_monte_carlo(MF, first_visit, exploring_starts, use_N, alpha, 166 | n_episodes, max_steps, optimize, sample_step): 167 | 168 | π = MF.policy 169 | γ = MF.gamma 170 | α = alpha 171 | 172 | samples = [] 173 | 174 | v, q = np.zeros(MF.states.N), np.zeros((MF.states.N, MF.actions.N)) 175 | if use_N: 176 | n_s, n_sa = np.zeros(MF.states.N), np.zeros((MF.states.N, MF.actions.N)) 177 | 178 | s_0, a_0 = MF.random_sa(value=True) 179 | 180 | for n_episode in tqdm(range(n_episodes), desc='Monte Carlo', unit='episodes'): 181 | if exploring_starts: 182 | s_0, a_0 = MF.random_sa(value=True) 183 | 184 | episode = MF.generate_episode(s_0, a_0, π, max_steps) 185 | sar = np.array(episode) 186 | s, a, _ = sar.T 187 | 188 | G = 0 189 | for t, (s_t, a_t, r_tt) in enumerate(sar[::-1]): 190 | s_t, a_t = int(s_t), int(a_t) 191 | G = γ*G + r_tt 192 | 
if use_N: 193 | update = _mc_step(v, q, t, s_t, a_t, s, a, n_s, 194 | n_sa, G, first_visit) 195 | else: 196 | update = _mc_step_α(v, q, t, s_t, a_t, s, a, 197 | α, G, first_visit) 198 | if optimize and update: 199 | π.update_policy(q, s_t) 200 | 201 | if sample_step and n_episode % sample_step == 0: 202 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 203 | 204 | return v, q, samples 205 | 206 | 207 | def off_policy_mc(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 208 | gamma: float=0.9, first_visit: bool=True, ordinary: bool=False, 209 | n_episodes: int=MAX_ITER, max_steps: int=MAX_STEPS, samples: int=1000, 210 | optimize: bool=False, policy: ModelFreePolicy=None, eps: float=None, 211 | b: ModelFreePolicy=None) -> Tuple[VQPi, Samples]: 212 | '''Off-policy Monte Carlo state and action value function estimation, policy 213 | 214 | Off policy Monte Carlo method for estimating state and action-value functtions 215 | as well as optimizing policies. If no behavior policy is provided an 216 | equal probability one for each (s,a) pair will be used. In order to guarantee 217 | convergence you must specify 218 | 219 | Parameters 220 | ---------- 221 | states : Sequence[Any] 222 | actions : Sequence[Any] 223 | transition : Callable[[Any,Any],[[Any,float], bool]]] 224 | transition must be a callable function that takes as arguments the 225 | (state, action) and returns (new_state, reward), end. 226 | gamma : float, optional 227 | Discount factor, by default 0.9 228 | first_visit : bool, optional 229 | If true, it will only use the first visit to a state, by default True 230 | ordinary : bool, optional 231 | ordinary sampling, beware! high variance, by default False 232 | n_episodes : int, optional 233 | Number of episodes to simulate, by default 1E4 234 | max_steps : int, optional 235 | Maximum number of steps per episode, by default 1E3 236 | samples : int, optional 237 | Number of samples to take, by default 1000 238 | optimize : bool, optional 239 | Whether to optimize the policy or not, by default False 240 | policy : ModelFreePolicy, optional 241 | Policy to use, by default equal probability ModelFreePolicy 242 | eps : float, optional 243 | Epsilon for the EpsilonSoftPolicy, by default None (no exploration) 244 | b : ModelFreePolicy, optional 245 | Behavior policy, by default None (equal probability ModelFreePolicy) 246 | 247 | Returns 248 | ------- 249 | vqpi : Tuple[VPi, QPi, Policy] 250 | Value function, action-value function, policy and samples if any. 251 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 252 | Samples taken during the simulation if any. The first element is the 253 | index of the iteration, the second is the value function, the third is 254 | the action-value function and the fourth is the TODO:. 255 | 256 | Raises 257 | ------ 258 | TransitionException: transition calls function checks. 
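
    Examples
    --------
    Illustrative sketch; the one-state transition below is an assumption, not
    part of this package. Episodes are generated with the default equiprobable
    behavior policy ``b`` while the target ``policy`` is evaluated:

    >>> import numpy as np
    >>> states = [0]
    >>> actions = ['left', 'right']
    >>> def transition(state, action):
    ...     if action == 'right':
    ...         return (state, 0.0), True
    ...     if np.random.rand() > 0.9:
    ...         return (state, 1.0), True
    ...     return (state, 0.0), False
    >>> vqpi, samples = off_policy_mc(states, actions, transition, gamma=1.0,
    ...                               first_visit=True, n_episodes=5000)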
259 | ''' 260 | 261 | policy = _set_policy(policy, eps, actions, states) 262 | if not b: 263 | b = ModelFreePolicy(actions, states) 264 | 265 | _typecheck_all(tabular_idxs=[states, actions],transition=transition, 266 | constants=[gamma, n_episodes, max_steps, samples], 267 | booleans=[first_visit, optimize], 268 | policies=[policy, b]) 269 | _check_ranges(values=[gamma, n_episodes, max_steps, samples], 270 | ranges=[(0,1), (1,np.inf), (1,np.inf), (1,1001)]) 271 | 272 | sample_step = _get_sample_step(samples, n_episodes) 273 | 274 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 275 | v, q, samples = _off_policy_monte_carlo(model, b, int(n_episodes), 276 | max_steps, first_visit, ordinary, optimize, sample_step) 277 | 278 | return VQPi((v, q, policy)), samples 279 | 280 | 281 | def _mc_step_off(q, v, t, s_t, a_t, s, a, G, w, c, c_q, 282 | first_visit, ordinary): 283 | 284 | c_add = 1 if ordinary else w 285 | denom = w if ordinary else 1 286 | 287 | if s_t not in s[:-(t+1)] or not first_visit: 288 | c[s_t] = c[s_t] + c_add 289 | if w < 1E-10: 290 | if ordinary: 291 | v[s_t] = v[s_t] - 1/c[s_t] * v[s_t] 292 | else: 293 | v[s_t] = v[s_t] + w/c[s_t] * (G - v[s_t]/denom) 294 | 295 | q_key = (s_t, a_t) 296 | if q_key not in zip(s[:-(t+1)],a[:-(t+1)]) or not first_visit: 297 | c_q[q_key] = c_q[q_key] + c_add 298 | if w < 1E-10: 299 | if ordinary: 300 | q[q_key] = q[q_key] - 1/c_q[q_key] * q[q_key] 301 | else: 302 | q[q_key] = q[q_key] + w/c_q[q_key] * (G - q[q_key]/denom) 303 | return True 304 | 305 | return False 306 | 307 | 308 | def _off_policy_monte_carlo(MF, off_policy, n_episodes, max_steps, first_visit, 309 | ordinary, optimize, sample_step): 310 | 311 | γ = MF.gamma 312 | b = off_policy 313 | π = MF.policy 314 | 315 | samples = [] 316 | 317 | v, q = np.zeros(MF.states.N), np.zeros((MF.states.N, MF.actions.N)) 318 | c, c_q = np.zeros(MF.states.N), np.zeros((MF.states.N, MF.actions.N)) 319 | 320 | for n_episode in tqdm(range(int(n_episodes)), desc='Off-policy MC', unit='episodes'): 321 | G = 0. 322 | s_0, a_0 = MF.random_sa(value=True) 323 | episode = MF.generate_episode(s_0, a_0, b, max_steps) 324 | sar = np.array(episode) 325 | s, a, _ = sar.T 326 | 327 | w = 1. 328 | for t, (s_t, a_t, r_tt) in enumerate(sar[::-1]): 329 | if w < 1E-10: 330 | break 331 | 332 | s_t, a_t = int(s_t), int(a_t) 333 | 334 | rho = π.pi_as(a_t, s_t)/b.pi_as(a_t, s_t) 335 | w = w*rho 336 | 337 | G = γ*G + r_tt 338 | update = _mc_step_off(q, v, t, s_t, a_t, s, a, 339 | G, w, c, c_q, first_visit, ordinary) 340 | 341 | if update and optimize: 342 | π.update_policy(q, s_t) 343 | 344 | if sample_step and n_episode % sample_step == 0: 345 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 346 | 347 | return v, q, samples 348 | 349 | 350 | 351 | def tdn(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 352 | state_0: Any=None, action_0: Any=None, gamma: float=0.9, n: int=1, 353 | alpha: float=0.05, n_episodes: int=MAX_ITER, policy: ModelFreePolicy=None, 354 | eps: float=None, optimize: bool=False, method: str='sarsa', samples: int=1000, 355 | max_steps: int=MAX_STEPS) -> Tuple[VQPi, Samples]: 356 | '''N-temporal differences algorithm. 357 | 358 | Temporal differences algorithm for estimating the value function of a 359 | policy, improve it and analyze it. 
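
    Depending on ``method`` this dispatches to on-policy n-step Sarsa
    ('sarsa_on'), the quasi-off-policy Sarsa / Q-learning / expected-Sarsa
    variants, or double Q-learning ('dqlearning'); see the Examples section
    below.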
360 | 361 | Parameters 362 | ---------- 363 | states : Sequence[Any] 364 | actions : Sequence[Any] 365 | transition : Callable[[Any,Any],[[Any,float], bool]]] 366 | transition must be a callable function that takes as arguments the 367 | (state, action) and returns (new_state, reward), end. 368 | state_0 : Any, optional 369 | Initial state, by default None (random) 370 | action_0 : Any, optional 371 | Initial action, by default None (random) 372 | gamma : float, optional 373 | Discount factor, by default 0.9 374 | n : int, optional 375 | Number of steps to look ahead, by default 1 376 | alpha : float, optional 377 | Learning rate, by default 0.1 378 | n_episodes : int, optional 379 | Number of episodes to simulate, by default 1E4 380 | max_steps : int, optional 381 | Maximum number of steps per episode, by default 1E3 382 | policy : ModelFreePolicy, optional 383 | Policy to use, by default equal probability ModelFreePolicy 384 | eps : float, optional 385 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 386 | optimize : bool, optional 387 | Whether to optimize the policy or not, by default False 388 | samples : int, optional 389 | Number of samples to take, by default 1000 390 | 391 | Returns 392 | ------- 393 | vqpi : Tuple[VPi, QPi, Policy] 394 | Value function, action-value function, policy and samples if any. 395 | samples : Tuple[int, List[Vpi], List[Qpi], List[np.ndarray]] 396 | Samples taken during the simulation if any. The first element is the 397 | index of the iteration, the second is the value function, the third is 398 | the action-value function and the fourth is the TODO:. 399 | 400 | Raises 401 | ------ 402 | TypeError: If any of the arguments is not of the correct type. 403 | 404 | Examples 405 | -------- 406 | Define state action pairs 407 | >>> from rl import tdn 408 | >>> states = [0] 409 | >>> actions = ['left', 'right'] 410 | Define the transition method, taking (state, action) 411 | and returning (new_state, reward), end 412 | >>> def state_transition(state, action): 413 | >>> if action == 'right': 414 | >>> return (state, 0), True 415 | >>> if action == 'left': 416 | >>> threshold = np.random.random() 417 | >>> if threshold > 0.9: 418 | >>> return (state, 1), True 419 | >>> else: 420 | >>> return (state, 0), False 421 | Solve! 
422 | >>> tdn(states, actions, state_transition, gamma=1, n=3, alpha=0.05) 423 | (array([0.134]), array([[0.513., 0.]]), , None) 424 | ''' 425 | policy = _set_policy(policy, eps, actions, states) 426 | 427 | if method not in METHODS: 428 | raise ValueError( 429 | f'Unknown method {method}\n' 430 | 'Available methods are (sarsa, sarsa_on, qlearning, expected_sarsa' 431 | ', dqlearning)') 432 | 433 | _typecheck_all(tabular_idxs=[states,actions], transition=transition, 434 | constants=[gamma, n, alpha, n_episodes, samples, max_steps], 435 | booleans=[optimize], policies=[policy]) 436 | 437 | sample_step = _get_sample_step(samples, n_episodes) 438 | 439 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 440 | 441 | _tdn = METHOD_MAP[method] 442 | 443 | v, q, samples = _tdn(model, state_0, action_0, n, alpha, int(n_episodes), 444 | max_steps, optimize, method, sample_step) 445 | 446 | return VQPi((v, q, policy)), samples 447 | 448 | 449 | def _td_step(s, a, r, t, T, n, v, q, γ, α, gammatron, π=None): 450 | '''td step update''' 451 | s_t, a_t, rr = s[t], a[t], r[t:t+n] 452 | G = np.dot(gammatron[:rr.shape[0]], rr) 453 | G_v, G_q = G, G 454 | if t + n < T: 455 | G_v = G_v + (γ**n) * v[s[t+n]] 456 | G_q = G_q + (γ**n) * q[s[t+n], a[t+n]] 457 | 458 | v[s_t] = v[s_t] + α * (G_v - v[s_t]) 459 | q_key = (s_t, a_t) 460 | q[q_key] = q[q_key] + α * (G_q - q[q_key]) 461 | 462 | 463 | def _td_qlearning(s, a, r, t, T, n, v, q, γ, α, gammatron, π=None): 464 | '''td qlearning update''' 465 | s_t, a_t, rr = s[t], a[t], r[t:t+n] 466 | G = np.dot(gammatron[:rr.shape[0]], rr) 467 | if t + n < T: 468 | G = G + (γ**n) * np.max(q[s[t+n]]) 469 | 470 | v[s_t] = v[s_t] + α * (G - v[s_t]) 471 | q_key = (s_t, a_t) 472 | q[q_key] = q[q_key] + α * (G - q[q_key]) 473 | 474 | 475 | def _td_expected_sarsa(s, a, r, t, T, n, v, q, γ, α, gammatron, π=None): 476 | s_t, a_t, rr = s[t], a[t], r[t:t+n] 477 | G = np.dot(gammatron[:rr.shape[0]], rr) 478 | if t + n < T: 479 | G = G + (γ**n) * np.dot(π.pi[s[t+n]], q[s[t+n]]) 480 | 481 | v[s_t] = v[s_t] + α * (G - v[s_t]) 482 | q_key = (s_t, a_t) 483 | q[q_key] = q[q_key] + α * (G - q[q_key]) 484 | 485 | 486 | STEP_MAP = { 487 | 'sarsa': _td_step, 488 | 'qlearning': _td_qlearning, 489 | 'expected_sarsa': _td_expected_sarsa, 490 | } 491 | 492 | 493 | def _tdn_onoff(MF, s_0, a_0, n, alpha, n_episodes, max_steps, optimize, 494 | method, sample_step): 495 | '''N-temporal differences algorithm. 496 | 497 | This is the basic implementation of the N-temporal difference algorithm. 498 | When optimizing the policy, the method for updating will be quasi-off 499 | policy. That is the updates are taking place with respect to the q-values 500 | updated on each step, but each step corresponds to the old policy. This 501 | implies that at the beginning of the updates are strictly on policy, and 502 | at the end, when probably all the states have been visited, the updates 503 | are off policy. 
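
    For reference, each step function in STEP_MAP builds the usual n-step
    target

        G = R_{t+1} + γ R_{t+2} + ... + γ^{n-1} R_{t+n} + γ^n B(S_{t+n})

    where the bootstrap term B is q[S_{t+n}, A_{t+n}] for 'sarsa',
    max_a q[S_{t+n}, a] for 'qlearning' and Σ_a π(a|S_{t+n}) q[S_{t+n}, a] for
    'expected_sarsa', and is dropped once t + n runs past the end of the
    episode; v[S_t] and q[S_t, A_t] are then moved toward G with step size α.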
504 | ''' 505 | π = MF.policy 506 | α = alpha 507 | γ = MF.gamma 508 | gammatron = np.array([γ**i for i in range(n)]) 509 | v, q = MF.init_vq() 510 | 511 | 512 | f_step = STEP_MAP[method] 513 | 514 | samples = [] 515 | for n_episode in tqdm(range(n_episodes), desc=f'td({n-1})', unit='episode'): 516 | if not s_0: 517 | s_0, _ = MF.random_sa(value=True) 518 | if not a_0: 519 | _, a_0 = MF.random_sa(value=True) 520 | episode = MF.generate_episode(s_0, a_0, π, max_steps) 521 | 522 | sar = np.array(episode) 523 | s, a, r = sar[:,0], sar[:,1], sar[:,2] 524 | 525 | s = s.astype(int) 526 | a = a.astype(int) 527 | 528 | T = s.shape[0] 529 | for t in range(T): 530 | f_step(s, a, r, t, T, n, v, q, γ, α, gammatron, π) 531 | # episode is already set so next step is not generated 532 | # via a greedy strategy, each episode generation is greedy 533 | if optimize: 534 | # in/out-place update for current and next episode 535 | # off policy without importance weighting 536 | π.update_policy(q, s[t]) 537 | 538 | if sample_step and n_episode % sample_step == 0: 539 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 540 | 541 | return v, q, samples 542 | 543 | 544 | def _td_dq_step(s, a, r, t, T, n, v1, q1, v2, q2, γ, α, gammatron, π): 545 | '''td step update''' 546 | s_t, a_t, rr = s[t], a[t], r[t:t+n] 547 | G = np.dot(gammatron[:rr.shape[0]], rr) 548 | G_v, G_q = G, G 549 | if t + n < T: 550 | G_v = G_v + (γ**n) * v2[s[t+n]] 551 | G_q = G_q + (γ**n) * q2[s[t+n], np.argmax(q1[s[t+n]])] 552 | 553 | v1[s_t] = v1[s_t] + α * (G_v - v1[s_t]) 554 | q_key = (s_t, a_t) 555 | q1[q_key] = q1[q_key] + α * (G_q - q1[q_key]) 556 | 557 | 558 | def _double_q(MF, s_0, a_0, n, alpha, n_episodes, max_steps, optimize, 559 | method, sample_step): 560 | 561 | π, α, γ = MF.policy, alpha, MF.gamma 562 | gammatron = np.array([γ**i for i in range(n)]) 563 | 564 | v1, q1 = MF.init_vq() 565 | v2, q2 = MF.init_vq() 566 | v, q = MF.init_vq() 567 | 568 | samples = [] 569 | for n_episode in tqdm(range(n_episodes)): 570 | s_0, a_0 = _set_s0_a0(MF, s_0, a_0) 571 | episode = MF.generate_episode(s_0, a_0, π, max_steps) 572 | 573 | sar = np.array(episode) 574 | s, a, r = sar[:,0], sar[:,1], sar[:,2] 575 | 576 | s = s.astype(int) 577 | a = a.astype(int) 578 | 579 | T = s.shape[0] 580 | for t in range(T): 581 | if np.random.rand() < 0.5: 582 | _td_dq_step(s, a, r, t, T, n, v1, q1, v2, q2, γ, α, gammatron, π) 583 | else: 584 | _td_dq_step(s, a, r, t, T, n, v2, q2, v1, q1, γ, α, gammatron, π) 585 | 586 | v = (v1 + v2)/2 587 | q = (q1 + q2)/2 588 | 589 | if optimize: 590 | π.update_policy(q, s[t]) 591 | 592 | if sample_step and n_episode % sample_step == 0: 593 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 594 | 595 | return v, q, samples 596 | 597 | 598 | def _tdn_on(MF, s_0, a_0, n, alpha, n_episodes, max_steps, optimize, 599 | method, sample_step): 600 | '''N-temporal differences algorithm for learning. 
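
    Rewards, actions and states are buffered in R, A and S; at step t the state
    visited at τ = t - n + 1 is updated (once τ >= 0), and the episode loop
    stops when τ reaches T - 1.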
601 | 602 | Super slow and inefficient, but readable and replicated exactly 603 | from sutton's n-step SARSA 604 | ''' 605 | π, α, γ = MF.policy, alpha, MF.gamma 606 | gammatron = np.array([γ**i for i in range(n)]) 607 | 608 | v, q = MF.init_vq() 609 | 610 | samples = [] 611 | for n_episode in tqdm(range(n_episodes), desc=f'td({n-1}) variant', unit='episode'): 612 | s_0, a_0 = _set_s0_a0(MF, s_0, a_0) 613 | 614 | s = MF.states.get_index(s_0) 615 | a = MF.actions.get_index(a_0) 616 | T = int(max_steps) 617 | R, A, S, G = [], [a], [s], 0 618 | for t in range(T): 619 | if t < T: 620 | (s, r), end = MF.step_transition(s, a) 621 | R.append(r) 622 | S.append(s) 623 | if end: 624 | T = t + 1 625 | else: 626 | a = π(s) 627 | A.append(a) 628 | 629 | tau = t - n + 1 630 | if tau >= 0: 631 | rr = np.array(R[tau:min(tau+n, T)]) 632 | G = gammatron[:rr.shape[0]].dot(rr) 633 | G_v, G_q = G, G 634 | if tau + n < T: 635 | G_v = G_v + γ**n * v[S[tau+n]] 636 | G_q = G_q + γ**n * q[S[tau+n], A[tau+n]] 637 | 638 | s_t = S[tau] 639 | a_t = A[tau] 640 | v[s_t] = v[s_t] + α * (G_v - v[s_t]) 641 | q[(s_t, a_t)] = q[(s_t, a_t)] + α * (G_q - q[(s_t, a_t)]) 642 | 643 | π.update_policy(q, s_t) 644 | 645 | if tau == T - 1: 646 | break 647 | 648 | if n_episode % sample_step == 0: 649 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 650 | 651 | return v, q, samples 652 | 653 | 654 | METHOD_MAP = { 655 | 'sarsa_on': _tdn_on, 656 | 'sarsa': _tdn_onoff, 657 | 'qlearning': _tdn_onoff, 658 | 'expected_sarsa': _tdn_onoff, 659 | 'dqlearning': _double_q 660 | } 661 | 662 | 663 | METHODS = METHOD_MAP.keys() 664 | 665 | 666 | def n_tree_backup(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 667 | state_0: Any=None, action_0: Any=None, gamma: float=1.0, n: int=1, 668 | alpha: float=0.05, n_episodes: int=MAX_ITER, policy: ModelFreePolicy=None, 669 | eps: float=None, optimize: bool=False, samples: int=1000, max_steps: int=MAX_STEPS 670 | ) -> Tuple[VQPi, Samples]: 671 | '''N-temporal differences algorithm. 672 | 673 | Temporal differences algorithm for estimating the value function of a 674 | policy, improve it and analyze it. 675 | 676 | Parameters 677 | ---------- 678 | states : Sequence[Any] 679 | actions : Sequence[Any] 680 | state_0 : Any, optional 681 | Initial state, by default None (random) 682 | action_0 : Any, optional 683 | Initial action, by default None (random) 684 | transition : Callable[[Any,Any],[[Any,float], bool]]] 685 | transition must be a callable function that takes as arguments the 686 | (state, action) and returns (new_state, reward), end. 687 | gamma : float, optional 688 | Discount factor, by default 0.9 689 | n : int, optional 690 | Number of steps to look ahead, by default 1 691 | alpha : float, optional 692 | Learning rate, by default 0.1 693 | n_episodes : int, optional 694 | Number of episodes to simulate, by default 1E4 695 | max_steps : int, optional 696 | Maximum number of steps per episode, by default 1E3 697 | policy : ModelFreePolicy, optional 698 | Policy to use, by default equal probability ModelFreePolicy 699 | eps : float, optional 700 | Epsilon value for the epsilon-soft policy, by default None (no exploration) 701 | optimize : bool, optional 702 | Whether to optimize the policy or not, by default False 703 | samples : int, optional 704 | Number of samples to take, by default 1000 705 | 706 | Returns 707 | ------- 708 | vqpi : Tuple[VPi, QPi, Policy] 709 | Value function, action-value function, policy and samples if any. 
710 | samples : Tuple[int, List[Vpi], List[Qpi], List[ModelFreePolicy]] 711 | Samples taken during the simulation if any. The first element is the 712 | index of the iteration, the second is the value function, the third is 713 | the action-value function and the fourth is the TODO:. 714 | 715 | Raises 716 | ------ 717 | TransitionException: Ill defined transitions. 718 | ''' 719 | policy = _set_policy(policy, eps, actions, states) 720 | 721 | _typecheck_all(tabular_idxs=[states,actions], transition=transition, 722 | constants=[gamma, n, alpha, n_episodes, samples, max_steps], 723 | booleans=[optimize], policies=[policy]) 724 | 725 | sample_step = _get_sample_step(samples, n_episodes) 726 | 727 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 728 | 729 | v, q, samples = _n_tree_backup(model, state_0, action_0, n, alpha, int(n_episodes), 730 | max_steps, optimize, sample_step) 731 | 732 | return VQPi((v, q, policy)), samples 733 | 734 | 735 | def _n_tree_backup(MF, s_0, a_0, n, alpha, n_episodes, max_steps, 736 | optimize, sample_step): 737 | 738 | π, α, γ = MF.policy, alpha, MF.gamma 739 | 740 | v, q = MF.init_vq() 741 | 742 | samples = [] 743 | 744 | for n_episode in tqdm(range(n_episodes), desc=f'{n}-Tree Backup', unit='episodes'): 745 | s_0, a_0 = _set_s0_a0(MF, s_0, a_0) 746 | 747 | s = MF.states.get_index(s_0) 748 | a = MF.actions.get_index(a_0) 749 | T = int(max_steps) 750 | R, A, S, G = [], [a], [s], 0 751 | 752 | for t in range(T): 753 | if t < T: 754 | (s, r), end = MF.step_transition(s, a) 755 | R.append(r) 756 | S.append(s) 757 | if end: 758 | T = t + 1 759 | else: 760 | _, a = MF.random_sa() 761 | A.append(a) 762 | 763 | tau = t - n + 1 764 | if tau >= 0: 765 | if t + 1 >= T: 766 | G = R[-1] 767 | else: 768 | G = R[t] + γ*np.dot(π.pi[s[t]], q[s[t]]) 769 | 770 | for k in range(min(t, T-1), tau): 771 | G = R[k-1] + γ*np.dot(π.pi[s[k-1]], q[s[k-1]]) + \ 772 | γ*π.pi[s[k-1],A[k-1]]*(G-q[s[k-1], A[k-1]]) 773 | 774 | q[S[tau], A[tau]] = q[S[tau], A[tau]] + α[G-q[S[tau], A[tau]]] 775 | 776 | if optimize: 777 | π.update_policy(q, S[tau]) 778 | 779 | if tau == T - 1: 780 | break 781 | 782 | if n_episode % sample_step == 0: 783 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 784 | 785 | return v, q, samples -------------------------------------------------------------------------------- /rl/solvers/planning.py: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Tuple, 3 | Sequence, 4 | Any 5 | ) 6 | 7 | from tqdm import tqdm 8 | import numpy as np 9 | from numpy.linalg import norm as lnorm 10 | 11 | from rl.solvers.model_free import ( 12 | get_sample, 13 | _set_s0_a0, 14 | _set_policy, 15 | ) 16 | from rl.model_free import ModelFree, ModelFreePolicy 17 | from rl.utils import ( 18 | UCTree, 19 | UCTNode, 20 | _typecheck_all, 21 | _get_sample_step, 22 | _check_ranges, 23 | VQPi, 24 | Samples, 25 | Transition, 26 | Vpi, 27 | Qpi, 28 | PQueue, 29 | MAX_ITER, 30 | MAX_STEPS, 31 | TOL 32 | ) 33 | 34 | 35 | def dynaq(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 36 | state_0: Any=None, action_0: Any=None, gamma: float=1.0, kappa: float=0.01, 37 | n: int=1, plus: bool=False, alpha: float=0.05, n_episodes: int=MAX_ITER, 38 | policy: ModelFreePolicy=None, eps: float=None, samples: int=1000, 39 | max_steps: int=MAX_STEPS) -> Tuple[VQPi, Samples]: 40 | ''' 41 | TODO: docs 42 | ''' 43 | policy = _set_policy(policy, eps, actions, states) 44 | 45 | 
_typecheck_all(tabular_idxs=[states,actions], transition=transition, 46 | constants=[gamma, kappa, n, alpha, n_episodes, samples, max_steps], 47 | booleans=[plus], policies=[policy]) 48 | 49 | # check ranges 50 | 51 | sample_step = _get_sample_step(samples, n_episodes) 52 | 53 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 54 | v, q, samples = _dyna_q(model, state_0, action_0, n, alpha, kappa, plus, 55 | int(n_episodes), max_steps, sample_step) 56 | 57 | return VQPi((v, q, policy)), samples 58 | 59 | 60 | def _dyna_q(MF, s_0, a_0, n, alpha, kappa, plus, n_episodes, max_steps, 61 | sample_step): 62 | 63 | π, α, γ, κ = MF.policy, alpha, MF.gamma, kappa 64 | 65 | v, q = MF.init_vq() 66 | 67 | S, A = MF.states.N, MF.actions.N 68 | model_sas = np.zeros((S, A), dtype=int) 69 | model_sar = np.zeros((S, A), dtype=float) 70 | times_sa = np.zeros((S, A), dtype=int) 71 | 72 | samples = [] 73 | current_t = 0 74 | for n_episode in tqdm(range(n_episodes), desc='Dyna-Q', unit='episodes'): 75 | s_0, _ = _set_s0_a0(MF, s_0, None) 76 | 77 | s = MF.states.get_index(s_0) 78 | T = int(max_steps) 79 | 80 | for t in range(T): 81 | a = π(s) 82 | (s_, r), end = MF.step_transition(s, a) # real next state 83 | q[s, a] = q[s, a] + α*(r + γ*np.max(q[s_]) - q[s, a]) 84 | 85 | times_sa[s, a] = current_t 86 | 87 | # assuming deterministic environment 88 | model_sas[s, a] = s_ 89 | model_sar[s, a] = r 90 | 91 | current_t += 1 92 | 93 | for _ in range(n): 94 | rs, ra = MF.random_sa() 95 | s_m = model_sas[rs, ra] # model next state 96 | r_ = model_sar[rs, ra] 97 | R = r_ 98 | if plus: 99 | tau = current_t - times_sa[rs, ra] 100 | R = R + κ*np.sqrt(tau) 101 | q[rs, ra] = q[rs, ra] + α*(R + γ*np.max(q[s_m]) - q[rs, ra]) 102 | 103 | π.update_policy(q, s_) 104 | s = s_ # current state equal next state 105 | if end: 106 | break 107 | 108 | if n_episode % sample_step == 0: 109 | samples.append(get_sample(MF, v, q, π, n_episode, True)) 110 | 111 | return v, q, samples 112 | 113 | 114 | def priosweep(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 115 | state_0: Any=None, action_0: Any=None, gamma: float=1.0, theta: float=0.01, 116 | n: int=1, plus: bool=False, alpha: float=0.05, n_episodes: int=MAX_ITER, 117 | policy: ModelFreePolicy=None, eps: float=None, samples: int=1000, 118 | max_steps: int=MAX_STEPS) -> Tuple[VQPi, Samples]: 119 | ''' 120 | TODO: docs 121 | ''' 122 | policy = _set_policy(policy, eps, actions, states) 123 | 124 | _typecheck_all(tabular_idxs=[states, actions], transition=transition, 125 | constants=[gamma, theta, n, alpha, n_episodes, samples, max_steps], 126 | booleans=[plus], policies=[policy]) 127 | 128 | # check ranges 129 | 130 | sample_step = _get_sample_step(samples, n_episodes) 131 | 132 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 133 | v, q, samples = _priosweep(model, state_0, action_0, n, alpha, theta, 134 | int(n_episodes), max_steps, sample_step) 135 | 136 | return VQPi((v, q, policy)), samples 137 | 138 | 139 | def _priosweep(MF, s_0, a_0, n, alpha, theta, n_episodes, max_steps, 140 | sample_step): 141 | 142 | π, α, γ = MF.policy, alpha, MF.gamma 143 | v, q = MF.init_vq() 144 | 145 | P, Pq, θ = 0, PQueue([]), theta 146 | 147 | S, A = MF.states.N, MF.actions.N 148 | model_sas = np.zeros((S, A), dtype=int) 149 | model_sar = np.zeros((S, A), dtype=float) 150 | times_sa = np.zeros((S, A), dtype=int) 151 | 152 | samples, current_t = [], 0 153 | for n_episode in tqdm(range(n_episodes), desc='priosweep', 
unit='episodes'): 154 | s_0, _ = _set_s0_a0(MF, s_0, None) 155 | 156 | s = MF.states.get_index(s_0) 157 | T = int(max_steps) 158 | 159 | for t in range(T): 160 | a = π(s) 161 | (s_, r), end = MF.step_transition(s, a) # real next state 162 | times_sa[s, a] = current_t 163 | model_sas[s, a] = s_ 164 | model_sar[s, a] = r 165 | 166 | P = np.abs(r + γ*np.max(q[s_]) - q[s, a]) 167 | if P > θ: 168 | Pq.push((s, a), P) 169 | 170 | current_t += 1 171 | 172 | for _ in range(n): 173 | if Pq.empty(): 174 | break 175 | 176 | ps, pa = Pq.pop() 177 | s_m = model_sas[ps, pa] # model next state 178 | r_ = model_sar[ps, pa] 179 | R = r_ 180 | 181 | q[ps, pa] = q[ps, pa] + α*(R + γ*np.max(q[s_m]) - q[ps, pa]) 182 | 183 | # grab all the index where model_sas == s 184 | mmask = (model_sas == s) 185 | for ss, aa in zip(*np.where(mmask)): 186 | rr = model_sar[ss, aa] 187 | P = np.abs(rr + γ*np.max(q[s]) - q[ss, aa]) 188 | if P > θ: 189 | Pq.push((s, a), P) 190 | 191 | π.update_policy(q, s_) 192 | s = s_ # current state equal next state 193 | if end: 194 | break 195 | 196 | if n_episode % sample_step == 0: 197 | samples.append(get_sample(MF, v, q, π, n_episode, True)) 198 | 199 | return v, q, samples 200 | 201 | 202 | def t_sampling(states: Sequence[Any], actions: Sequence[Any], transition: Transition, 203 | state_0: Any=None, action_0: Any=None, gamma: float=1.0, 204 | n_episodes: int=MAX_ITER, policy: ModelFreePolicy=None, eps: float=None, 205 | samples: int=1000, optimize: bool=False, max_steps: int=MAX_STEPS 206 | ) -> Tuple[VQPi, Samples]: 207 | ''' 208 | TODO: docs 209 | ''' 210 | policy = _set_policy(policy, eps, actions, states) 211 | 212 | _typecheck_all(tabular_idxs=[states,actions], transition=transition, 213 | constants=[gamma, n_episodes, samples, max_steps], 214 | booleans=[optimize], policies=[policy]) 215 | 216 | # TODO: check ranges 217 | 218 | sample_step = _get_sample_step(samples, n_episodes) 219 | 220 | model = ModelFree(states, actions, transition, gamma=gamma, policy=policy) 221 | v, q, samples = _t_sampling(model, state_0, action_0, int(n_episodes), 222 | optimize, max_steps, sample_step) 223 | 224 | return VQPi((v, q, policy)), samples 225 | 226 | 227 | def _t_sampling(MF, s_0, a_0, n_episodes, optimize, 228 | max_steps, sample_step): 229 | 230 | π, γ = MF.policy, MF.gamma 231 | v, q = MF.init_vq() 232 | 233 | S, A = MF.states.N, MF.actions.N 234 | n_sas = np.zeros((S, A, S), dtype=int) # p(s'|s,a) 235 | model_sar = np.zeros((S, A, S), dtype=float) # r(s,a,s') deterministic reward 236 | 237 | samples = [] 238 | for n_episode in tqdm(range(n_episodes), desc='Trajectory Sampling', unit='episodes'): 239 | s, a = _set_s0_a0(MF, s_0, a_0) 240 | a_ = MF.actions.get_index(a) 241 | s = MF.states.get_index(s) 242 | 243 | for _ in range(int(max_steps)): 244 | (s_, r), end = MF.step_transition(s, a_) # real next state 245 | 246 | n_sas[s, a, s_] += 1 247 | model_sar[s, a, s_] = r # assumes deterministic reward 248 | 249 | # p_sas is the probability of transitioning from s to s' 250 | p_sas = n_sas[s,a]/np.sum(n_sas[s, a]) 251 | next_s_mask = np.where(p_sas)[0] 252 | max_q = np.max(q[next_s_mask, :], axis=1) 253 | r_ns = model_sar[s, a, next_s_mask] 254 | p_ns = p_sas[next_s_mask] 255 | 256 | q[s, a] = np.dot(p_ns, r_ns + γ*max_q) 257 | 258 | π.update_policy(q, s) 259 | a_ = π(s_) 260 | s = s_ 261 | 262 | if end: 263 | break 264 | 265 | if n_episode % sample_step == 0: 266 | samples.append(get_sample(MF, v, q, π, n_episode, optimize)) 267 | 268 | return v, q, samples 269 | 270 | 271 | def rtdp(): 
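    # TODO: real-time dynamic programming (RTDP, Sutton & Barto §8.7): planning
    # backups along trajectories experienced from the start state(s). Not
    # implemented yet.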
272 | raise NotImplementedError 273 | 274 | 275 | 276 | def _best_child(v, Cp): 277 | actions = np.array(list(v.children.keys())) 278 | qs = np.array([v.children[a].q for a in actions]) 279 | ns = np.array([v.children[a].n for a in actions]) 280 | ucb = qs/ns + Cp*np.sqrt(np.log(v.n)/ns) 281 | return v.children[actions[np.argmax(ucb)]] 282 | 283 | 284 | def _expand(v, transition, actions): 285 | a = np.random.choice(list(actions)) 286 | (s_, _), end = transition(v.state, a) 287 | v_prime = UCTNode(s_, a, 0, 1, v, end) 288 | v.children[a] = v_prime 289 | return v_prime 290 | 291 | 292 | def _tree_policy(tree, Cp, transition, action_map, eps): 293 | v = tree.root 294 | while not v.is_terminal: 295 | actions = action_map(v.state) 296 | took_actions = v.children.keys() 297 | unexplored = set(actions) - set(took_actions) 298 | if not took_actions: 299 | return _expand(v, transition, actions) 300 | if unexplored and np.random.rand() < eps: 301 | return _expand(v, transition, unexplored) 302 | v = _best_child(v, Cp) 303 | return v 304 | 305 | 306 | def _default_policy(v_leaf, transition, action_map, max_steps): 307 | step, r = 0, 0 308 | s = v_leaf.state 309 | 310 | if v_leaf.is_terminal: 311 | return r 312 | 313 | while step < max_steps: 314 | actions = action_map(s) 315 | a = np.random.choice(actions) 316 | (s, _r), end = transition(s, a) 317 | r += _r 318 | if end: 319 | return 1 320 | step += 1 321 | return -1 322 | 323 | 324 | def _backup(v_leaf, delta): 325 | v = v_leaf 326 | while v: 327 | v.n += 1 328 | v.q += delta 329 | v = v.parent 330 | 331 | 332 | def mcts(s0, Cp, budget, transition, action_map, max_steps, tree=None, 333 | eps=1, verbose=True): 334 | ''' 335 | Effectively implementing the UCT search algorithm 336 | ''' 337 | s = s0 338 | if not tree: 339 | tree = UCTree(s, Cp) 340 | for _ in tqdm(range(budget), desc='MCTS', disable=not verbose): 341 | v_leaf = _tree_policy(tree, Cp, transition, action_map, eps) 342 | delta = _default_policy(v_leaf, transition, action_map, max_steps) 343 | _backup(v_leaf, delta) 344 | 345 | v_best = _best_child(tree.root, 0) 346 | return v_best.action, tree 347 | -------------------------------------------------------------------------------- /rl/tiles.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tile Coding Software version 3.0beta 3 | by Rich Sutton 4 | based on a program created by Steph Schaeffer and others 5 | External documentation and recommendations on the use of this code is available in the 6 | reinforcement learning textbook by Sutton and Barto, and on the web. 7 | These need to be understood before this code is. 8 | 9 | This software is for Python 3 or more. 10 | 11 | This is an implementation of grid-style tile codings, based originally on 12 | the UNH CMAC code (see http://www.ece.unh.edu/robots/cmac.htm), but by now highly changed. 13 | Here we provide a function, "tiles", that maps floating and integer 14 | variables to a list of tiles, and a second function "tiles-wrap" that does the same while 15 | wrapping some floats to provided widths (the lower wrap value is always 0). 16 | 17 | The float variables will be gridded at unit intervals, so generalization 18 | will be by approximately 1 in each direction, and any scaling will have 19 | to be done externally before calling tiles. 20 | 21 | Num-tilings should be a power of 2, e.g., 16. To make the offsetting work properly, it should 22 | also be greater than or equal to four times the number of floats. 
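For example (illustrative), with an index hash table:

    iht = IHT(1024)
    tiles(iht, 8, [x/0.25, y/0.25])

returns a list of 8 tile indices in [0, 1024) for the point (x, y); the scaling
(here 1/0.25) is chosen by the caller and sets the width of generalization.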
23 | 24 | The first argument is either an index hash table of a given size (created by (make-iht size)), 25 | an integer "size" (range of the indices from 0), or nil (for testing, indicating that the tile 26 | coordinates are to be returned without being converted to indices). 27 | """ 28 | from math import floor, log 29 | from itertools import zip_longest 30 | 31 | basehash = hash 32 | 33 | class IHT: 34 | "Structure to handle collisions" 35 | def __init__(self, sizeval): 36 | self.size = sizeval 37 | self.overfullCount = 0 38 | self.dictionary = {} 39 | 40 | def __str__(self): 41 | "Prepares a string for printing whenever this object is printed" 42 | return "Collision table:" + \ 43 | " size:" + str(self.size) + \ 44 | " overfullCount:" + str(self.overfullCount) + \ 45 | " dictionary:" + str(len(self.dictionary)) + " items" 46 | 47 | def count(self): 48 | return len(self.dictionary) 49 | 50 | def fullp(self): 51 | return len(self.dictionary) >= self.size 52 | 53 | def getindex(self, obj, readonly=False): 54 | d = self.dictionary 55 | if obj in d: return d[obj] 56 | elif readonly: return None 57 | size = self.size 58 | count = self.count() 59 | if count >= size: 60 | if self.overfullCount==0: print('IHT full, starting to allow collisions') 61 | self.overfullCount += 1 62 | return basehash(obj) % self.size 63 | else: 64 | d[obj] = count 65 | return count 66 | 67 | def hashcoords(coordinates, m, readonly=False): 68 | if type(m)==IHT: return m.getindex(tuple(coordinates), readonly) 69 | if type(m)==int: return basehash(tuple(coordinates)) % m 70 | if m==None: return coordinates 71 | 72 | 73 | def tiles(ihtORsize, numtilings, floats, ints=[], readonly=False): 74 | """returns num-tilings tile indices corresponding to the floats and ints""" 75 | qfloats = [floor(f*numtilings) for f in floats] 76 | Tiles = [] 77 | for tiling in range(numtilings): 78 | tilingX2 = tiling*2 79 | coords = [tiling] 80 | b = tiling 81 | for q in qfloats: 82 | coords.append( (q + b) // numtilings ) 83 | b += tilingX2 84 | coords.extend(ints) 85 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 86 | return Tiles 87 | 88 | def tileswrap(ihtORsize, numtilings, floats, wrapwidths, ints=[], readonly=False): 89 | """returns num-tilings tile indices corresponding to the floats and ints, wrapping some floats""" 90 | qfloats = [floor(f*numtilings) for f in floats] 91 | Tiles = [] 92 | for tiling in range(numtilings): 93 | tilingX2 = tiling*2 94 | coords = [tiling] 95 | b = tiling 96 | for q, width in zip_longest(qfloats, wrapwidths): 97 | c = (q + b%numtilings) // numtilings 98 | coords.append(c%width if width else c) 99 | b += tilingX2 100 | coords.extend(ints) 101 | Tiles.append(hashcoords(coords, ihtORsize, readonly)) 102 | return Tiles 103 | -------------------------------------------------------------------------------- /rl/utils.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import warnings 3 | from typing import ( 4 | Any, 5 | Sequence, 6 | List, 7 | Tuple, 8 | Callable, 9 | NewType 10 | ) 11 | 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | 15 | plt.style.use("dark_background") 16 | 17 | MAX_STEPS = 1E3 18 | MAX_ITER = int(1E4) 19 | TOL = 5E-8 20 | MEAN_ITERS = int(1E4) 21 | W_INIT = 1E-3 22 | 23 | class Policy(ABC): 24 | def __init__(self): 25 | pass 26 | 27 | @abstractmethod 28 | def __call__(self, state: int = None) -> int: 29 | raise NotImplementedError 30 | 31 | @abstractmethod 32 | def update_policy(self, 
*args, **kwargs):
33 |         raise NotImplementedError
34 | 
35 | 
36 | class _TabularIndexer():
37 |     '''Simple proxy for tabular states & actions.'''
38 |     def __init__(self, seq: Sequence[Any]):
39 |         self.seq = seq
40 |         self.N = len(seq)
41 |         self.index = {v: i for i, v in enumerate(seq)}
42 |         self.revindex = {i: v for i, v in enumerate(seq)}
43 | 
44 |     def get_index(self, v) -> Any:
45 |         return self.index[v]
46 | 
47 |     def from_index(self, idx) -> Any:
48 |         return self.revindex[idx]
49 | 
50 |     def random(self, value=False):
51 |         rnd_idx = np.random.choice(self.N)
52 |         if value:
53 |             return self.seq[rnd_idx]
54 |         return rnd_idx
55 | 
56 | 
57 | class State(_TabularIndexer):
58 |     pass
59 | 
60 | 
61 | class Action(_TabularIndexer):
62 |     pass
63 | 
64 | 
65 | class StateAction(_TabularIndexer):
66 |     pass
67 | 
68 | 
69 | class _TabularValues:
70 |     def __init__(self, values: np.ndarray, idx: _TabularIndexer):
71 |         self.v = values
72 |         self.idx = idx
73 |         self.idx_val = {k: v for k, v in zip(idx.index.keys(), values)}
74 | 
75 |     def values(self):
76 |         return self.v
77 | 
78 | 
79 | class Vpi(_TabularValues):
80 |     def __str__(self):
81 |         return f'Vpi({self.v[:5]}...)'
82 | 
83 | 
84 | class Qpi(_TabularValues):
85 |     def __str__(self):
86 |         return f'Qpi({self.v[:5]}...)'
87 | 
88 | 
89 | VQPi = NewType('VQPi', Tuple[Vpi, Qpi, Policy])
90 | Samples = NewType('Samples', Tuple[int, List[Vpi], List[Qpi], List[Policy]])
91 | Transition = Callable[[Any, Any], Tuple[Tuple[Any, float], bool]]
92 | EpisodeStep = NewType('EpisodeStep', Tuple[int, int, float])
93 | 
94 | class TransitionException(Exception):
95 |     pass
96 | 
97 | class PQueue:
98 |     '''Priority queue that pops the item with the smallest priority value first.'''
99 |     def __init__(self, items: List[Tuple[float, Any]]):
100 |         self.items = items
101 |         self._sort()
102 | 
103 |     def _sort(self):
104 |         self.items.sort(key=lambda x: x[0])
105 | 
106 |     def push(self, item, priority):
107 |         self.items.append((priority, item))
108 |         self._sort()
109 | 
110 |     def pop(self):
111 |         return self.items.pop(0)[1]
112 | 
113 |     def empty(self):
114 |         return len(self.items) == 0
115 | 
116 | 
117 | class RewardGenerator:
118 |     DISTRIBUTION = {
119 |         'bernoulli': np.random.binomial,
120 |         'gaussian': np.random.normal,
121 |         'uniform': np.random.uniform,
122 |         'exponential': np.random.exponential,
123 |         'poisson': np.random.poisson,
124 |         'pareto': np.random.pareto,
125 |         'triangular': np.random.triangular,
126 |     }
127 | 
128 |     @classmethod
129 |     def generate(cls, distribution='gaussian', *args, **kwargs) -> float:
130 |         generator = cls.DISTRIBUTION.get(distribution)
131 |         if not generator:
132 |             raise ValueError(f'Invalid distribution: {distribution}')
133 |         return generator(*args, **kwargs)
134 | 
135 | 
136 | class UCTNode:
137 |     def __init__(self, state, action, q, n, parent=None, is_terminal=False):
138 |         self.state = state
139 |         self.action = action
140 |         self.q = q
141 |         self.n = n
142 |         self.parent = parent
143 |         self.children = {}
144 |         self.is_terminal = is_terminal
145 | 
146 |     def add_child(self, child):
147 |         self.children[child.action] = child
148 |         return child
149 | 
150 | 
151 | class UCTree:
152 |     def __init__(self, root, Cp=1.0, max_steps=MAX_STEPS, nodes=None):
153 |         if not isinstance(root, UCTNode):
154 |             self.root = UCTNode(root, None, 0, 1, None)
155 |         else:
156 |             self.root = root
157 |         self.Cp = Cp
158 |         self.max_steps = max_steps
159 |         self.nodes = {} if not nodes else nodes
160 | 
161 |     def max_depth(self):
162 |         stack = [(self.root, 0)]
163 |         max_depth = 0
164 |         while stack:
165 |             node, depth = stack.pop()
166 |             max_depth = max(depth,
max_depth) 167 | for child in node.children.values(): 168 | stack.append((child, depth+1)) 169 | return max_depth 170 | 171 | def plot(self): 172 | max_depth = self.max_depth() 173 | width = 4*max_depth 174 | height = max_depth 175 | stack = [(self.root, 0, 0, width)] 176 | treenodes = [] 177 | lines = [] 178 | while stack: 179 | node, depth, x, step = stack.pop() 180 | node_pos = (x + step/2, height-depth) 181 | treenodes.append(node_pos) 182 | if node.children: 183 | n_childs = len(node.children) 184 | step = step/n_childs 185 | for i, child in enumerate(node.children.values()): 186 | stack.append((child, depth+1, x+i*step, step)) 187 | lines.append((node_pos, (step/2 + x+i*step, height-depth-1))) 188 | 189 | fig = plt.figure(figsize=(10, 10)) 190 | ax = fig.add_subplot(111) 191 | ax.set_xticks([]) 192 | ax.set_yticks([]) 193 | for node in treenodes: 194 | ax.scatter(node[0], node[1], color='white', s=1) 195 | for line in lines: 196 | ax.plot([line[0][0], line[1][0]], 197 | [line[0][1], line[1][1]], 198 | color='white', linewidth=0.5) 199 | plt.show() 200 | 201 | 202 | def _typecheck_tabular_idxs(*args): 203 | for arg in args: 204 | if not isinstance(arg, (Sequence, np.ndarray)): 205 | raise TypeError( 206 | f"Tabular Indexes must be Sequence, not {type(arg)}") 207 | 208 | 209 | def _typecheck_transition(transition): 210 | if not isinstance(transition, Callable): 211 | raise TypeError( 212 | f"transition must be a Callable, not {type(transition)}") 213 | 214 | if transition.__code__.co_argcount != 2: 215 | raise TypeError( 216 | f"transition must have two positional arguments," 217 | f" not {transition.__code__.co_argcount}") 218 | 219 | 220 | def _typecheck_constants(*args): 221 | for arg in args: 222 | if not isinstance(arg, (float, int)): 223 | raise TypeError( 224 | f"Constants must be float or int, not {type(arg)}") 225 | 226 | 227 | def _typecheck_booleans(*args): 228 | for arg in args: 229 | if not isinstance(arg, bool): 230 | raise TypeError( 231 | f"Booleans must be bool, not {type(arg)}") 232 | 233 | def _typecheck_policies(*args): 234 | for arg in args: 235 | if not isinstance(arg, Policy): 236 | raise TypeError( 237 | f"Policies must be Policy, not {type(arg)}") 238 | 239 | 240 | def _typecheck_all(tabular_idxs=None, transition=None, constants=None, 241 | booleans=None, policies=None): 242 | if tabular_idxs: 243 | _typecheck_tabular_idxs(*tabular_idxs) 244 | if transition: 245 | _typecheck_transition(transition) 246 | if constants: 247 | _typecheck_constants(*constants) 248 | if booleans: 249 | _typecheck_booleans(*booleans) 250 | if policies: 251 | _typecheck_policies(*policies) 252 | 253 | 254 | def _get_sample_step(samples, n_episodes): 255 | if samples > n_episodes: 256 | samples = n_episodes 257 | if samples > 1E3: 258 | samples = int(1E3) 259 | sample_step = int(n_episodes / samples) 260 | return sample_step 261 | 262 | 263 | def _check_ranges(values, ranges): 264 | for v, r in zip(values, ranges): 265 | if v < r[0] or v > r[1]: 266 | raise ValueError(f"{v} is out of range {r}") 267 | 268 | 269 | def auto_cardinal(values, n, safe=True): 270 | if (n+1)**len(values) > 2.5E6: 271 | if safe: 272 | raise ValueError("Too many combinations, may cause memory error," 273 | "set safe=False to avoid raising this error") 274 | else: 275 | warnings.warn("Too many combinations, may cause memory error") 276 | prod = np.array(np.meshgrid(*[values for _ in range(n)])) 277 | return prod.T.reshape(-1, n) 278 | 279 | 280 | class BasisException(Exception): 281 | pass 282 | 283 | 284 | def 
get_basis(self, basis, cij) -> Callable[[np.ndarray], np.ndarray]:
285 |     '''Get a basis feature constructor for a linear approximator, using a
286 |     polynomial or Fourier basis. (The first positional argument is unused.)
287 | 
288 |     Parameters
289 |     ----------
290 |     basis : str
291 |         Basis to use, either 'poly' or 'fourier'
292 |     cij : np.ndarray
293 |         Coefficient vectors for the basis functions
294 | 
295 |     Returns
296 |     -------
297 |     basis : Callable[[np.ndarray], np.ndarray]
298 |         Feature constructor. It only works on states represented as
299 |         sequences or numpy arrays; any other state type raises
300 |         BasisException.
301 |     '''
302 |     if basis == 'poly':
303 |         def _basis(s):
304 |             xs = [np.prod(s**cj) for cj in cij]
305 |             return np.array(xs)
306 |     elif basis == 'fourier':
307 |         def _basis(s):
308 |             xs = [np.cos(np.pi*np.dot(s, cj)) for cj in cij]
309 |             return np.array(xs)
310 |     else:
311 |         raise BasisException(f"Unknown basis '{basis}', use 'poly' or 'fourier'")
312 | 
313 |     def basis_f(s):
314 |         try:
315 |             return _basis(s)
316 |         except Exception:
317 |             raise BasisException('State must be a sequence or numpy array')
318 |     return basis_f
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | setup(name='rl', packages=find_packages())
--------------------------------------------------------------------------------
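Usage sketch for the UCT search in rl/solvers/planning.py (the import path is inferred from the tree layout, and the toy corridor environment below is hypothetical, not one of the repo's examples). The transition callable must return ((next_state, reward), done) and the action map must return the legal actions for a state, matching the Transition type alias in rl/utils.py.

import numpy as np
from rl.solvers.planning import mcts   # assumed import path

# Hypothetical 5-cell corridor: states 0..4, episode ends at cell 4.
def corridor_transition(s, a):
    s_next = int(np.clip(s + a, 0, 4))
    done = s_next == 4
    return (s_next, 1.0 if done else 0.0), done

def corridor_actions(s):
    return [-1, 1]          # move left or move right

best_action, tree = mcts(0, Cp=1.0, budget=500,
                         transition=corridor_transition,
                         action_map=corridor_actions,
                         max_steps=50, verbose=False)

best_action is the root child with the highest average backed-up value after the simulation budget; the returned tree can be passed back in through the tree argument to keep searching, or inspected with tree.plot().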
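The tile coder in rl/tiles.py is used as in Sutton's tiles3 documentation: build an index hash table, scale each float externally so a unit distance matches the desired generalization width, and collect one active tile per tiling. A minimal sketch (the mountain-car ranges below are illustrative, not taken from examples/mountain_car.py):

from rl.tiles import IHT, tiles

iht = IHT(4096)                     # all returned indices fall in [0, 4096)
position, velocity = -0.5, 0.02     # e.g. mountain-car state variables
# 8 tilings (a power of two, and >= 4 * number of floats, per the module docstring);
# each variable is scaled to roughly [0, 8] before the call.
active = tiles(iht, 8, [8 * (position + 1.2) / 1.7,
                        8 * (velocity + 0.07) / 0.14])
# `active` is a list of 8 integer indices, one active tile per tiling; an extra
# ints argument (e.g. [action]) can be appended to hash the action in as well.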
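A sketch of building features with get_basis from rl/utils.py, paired with auto_cardinal to enumerate coefficient vectors; this pairing is my assumption about intended use, not taken from rl/approximators.py. The first parameter of get_basis is unused, so None is passed for it here.

import numpy as np
from rl.utils import auto_cardinal, get_basis

# All coefficient vectors (c1, c2) with ci in {0, 1, 2}: an order-2 basis
# over a 2-dimensional state.
cij = auto_cardinal([0, 1, 2], 2)          # shape (9, 2)

fourier = get_basis(None, 'fourier', cij)  # first argument is unused
x = fourier(np.array([0.3, 0.7]))          # 9 features: cos(pi * s . c) for each c
poly = get_basis(None, 'poly', cij)
y = poly(np.array([0.3, 0.7]))             # 9 features: prod(s ** c) for each c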