├── environnements ├── __init__.py ├── oceanEnv.py ├── contextualBanditEnv.py └── nimEnv.py ├── requirements.txt ├── figure ├── nim_env.png ├── bandit_env.png ├── env_render.png ├── ocean_env.jpeg ├── slide_intro.png ├── slide_example.png ├── streamlit_example.png ├── DP │ ├── policy_iteration.gif │ ├── value_iteration.gif │ ├── q_values_joinBeach_estimated.gif │ ├── q_values_leaveBeach_estimated.gif │ ├── v_values_joinBeach_estimated.gif │ ├── v_values_leaveBeach_estimated.gif │ ├── q_values_swim_randomly_estimated.gif │ └── v_values_swim_randomly_estimated.gif ├── MC │ ├── MC_Control_eps_greedy.gif │ ├── q_values_joinBeach_estimated.gif │ ├── q_values_leaveBeach_estimated.gif │ ├── v_values_joinBeach_estimated.gif │ ├── v_values_leaveBeach_estimated.gif │ ├── q_values_swim_randomly_estimated.gif │ └── v_values_swim_randomly_estimated.gif └── TD │ ├── SARSA_Control_eps_greedy.gif │ ├── q_values_joinBeach_estimated.gif │ ├── q_values_leaveBeach_estimated.gif │ ├── v_values_joinBeach_estimated.gif │ ├── v_values_leaveBeach_estimated.gif │ ├── q_values_swim_randomly_estimated.gif │ └── v_values_swim_randomly_estimated.gif ├── .gitmodules ├── RL course EN v2022.pdf ├── RL course FR v2022.pdf ├── .github └── workflows │ └── sync_to_hf.yml ├── src ├── policies.py └── utils.py ├── DP ├── example_PI.py ├── example_IPE_leaveBeach.py ├── example_IPE_joinBeach.py ├── example_IPE_swimRandomly.py ├── plot_control_figures.py ├── plot_prediction_figures.py └── dynamicProgramming.py ├── .gitignore ├── TD ├── example_TD_control.py ├── plot_control_figures.py ├── example_TD_prediction.py ├── plot_prediction_figures.py └── TDLearning.py ├── MC ├── example_MC_control.py ├── plot_control_figures.py ├── example_MC_prediction.py └── plot_prediction_figures.py ├── README.md ├── playground_app ├── mappings.py └── playground.py └── streamlit_app.py /environnements/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym 2 | numpy 3 | streamlit 4 | plotly==5.9.0 5 | altair==4.0.0 -------------------------------------------------------------------------------- /figure/nim_env.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/nim_env.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "gridworld_rl"] 2 | path = gridworld_rl 3 | url = git@github.com:tboulet/gridworld_rl.git 4 | -------------------------------------------------------------------------------- /figure/bandit_env.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/bandit_env.png -------------------------------------------------------------------------------- /figure/env_render.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/env_render.png -------------------------------------------------------------------------------- /figure/ocean_env.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/ocean_env.jpeg -------------------------------------------------------------------------------- /RL course EN v2022.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/RL course EN v2022.pdf -------------------------------------------------------------------------------- /RL course FR v2022.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/RL course FR v2022.pdf -------------------------------------------------------------------------------- /figure/slide_intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/slide_intro.png -------------------------------------------------------------------------------- /figure/slide_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/slide_example.png -------------------------------------------------------------------------------- /figure/streamlit_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/streamlit_example.png -------------------------------------------------------------------------------- /figure/DP/policy_iteration.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/policy_iteration.gif -------------------------------------------------------------------------------- /figure/DP/value_iteration.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/value_iteration.gif -------------------------------------------------------------------------------- /figure/MC/MC_Control_eps_greedy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/MC_Control_eps_greedy.gif -------------------------------------------------------------------------------- /figure/TD/SARSA_Control_eps_greedy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/SARSA_Control_eps_greedy.gif -------------------------------------------------------------------------------- /figure/DP/q_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/q_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/DP/q_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/q_values_leaveBeach_estimated.gif 
-------------------------------------------------------------------------------- /figure/DP/v_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/v_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/DP/v_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/v_values_leaveBeach_estimated.gif -------------------------------------------------------------------------------- /figure/MC/q_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/q_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/MC/q_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/q_values_leaveBeach_estimated.gif -------------------------------------------------------------------------------- /figure/MC/v_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/v_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/MC/v_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/v_values_leaveBeach_estimated.gif -------------------------------------------------------------------------------- /figure/TD/q_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/q_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/TD/q_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/q_values_leaveBeach_estimated.gif -------------------------------------------------------------------------------- /figure/TD/v_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/v_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/TD/v_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/v_values_leaveBeach_estimated.gif -------------------------------------------------------------------------------- /figure/DP/q_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/q_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /figure/DP/v_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/v_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /figure/MC/q_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/q_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /figure/MC/v_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/v_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /figure/TD/q_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/q_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /figure/TD/v_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/v_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /.github/workflows/sync_to_hf.yml: -------------------------------------------------------------------------------- 1 | name: Sync to Hugging Face hub 2 | on: 3 | push: 4 | branches: [main] 5 | 6 | # to run this workflow manually from the Actions tab 7 | workflow_dispatch: 8 | 9 | jobs: 10 | sync-to-hub: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | with: 15 | fetch-depth: 0 16 | - name: Push to hub 17 | env: 18 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 19 | run: git push --force https://tboulet:$HF_TOKEN@huggingface.co/spaces/tboulet/RL-Playground main 20 | -------------------------------------------------------------------------------- /src/policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Policy: pass 4 | class PolicyForDiscreteState(Policy): pass 5 | 6 | class DiscretePolicyForDiscreteState(PolicyForDiscreteState): 7 | def __init__(self, probs : np.ndarray): 8 | self.probs = probs 9 | self.n_states, self.n_actions = probs.shape 10 | """ 11 | Example for 2 state and 4 actions. 
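        Each row of probs corresponds to a state and each column to an action: probs[s, a] is the probability of choosing action a in state s, so every row should sum to 1.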
12 | >>> probs = np.array([[0.1, 0.1, 0.7, 0.1], [0.7, 0.1, 0.2, 0.]]) 13 | >>> policy = DiscretePolicyForDiscreteState(probs) 14 | >>> state = 0 15 | >>> action = 0 16 | >>> prob_to_do_action_in_state = policy.get_prob(state, action) 17 | """ 18 | 19 | def get_prob(self, state : int, action : int) -> float: 20 | return self.probs[state, action] 21 | 22 | -------------------------------------------------------------------------------- /DP/example_PI.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 5 | from DP.dynamicProgramming import PolicyIteration 6 | 7 | algo_IP = PolicyIteration() 8 | 9 | print("\nFinding optimal policy...") 10 | ideal_policy, action_values = algo_IP.find_optimal_policy(transition_probability_ocean, 11 | reward_probability_ocean, 12 | gamma=.98, 13 | n_iterations=5, 14 | verbose=1, 15 | return_action_values=True, 16 | ) 17 | print("Optimal policy:", ideal_policy.probs) 18 | print("Final action values:", action_values) 19 | 20 | print("\nPolicy during the learning:") 21 | src.policies_and_actions = algo_IP.find_optimal_policy_yielding( transition_probability_ocean, 22 | reward_probability_ocean, 23 | gamma=.98, 24 | n_iterations=5, 25 | return_action_values=True, 26 | ) 27 | for elem in src.policies_and_actions: 28 | print(elem) 29 | -------------------------------------------------------------------------------- /environnements/oceanEnv.py: -------------------------------------------------------------------------------- 1 | from src.utils import * 2 | import gym 3 | from gym import spaces 4 | 5 | class OceanEnv(gym.Env): 6 | 7 | def __init__(self): 8 | self.action_space = spaces.Discrete(2) 9 | self.observation_space = spaces.Discrete(11) 10 | super().__init__() 11 | 12 | def reset(self) -> Observation: 13 | self.state = 10 14 | return self.state 15 | 16 | def step(self, action: Action) -> Tuple[Observation, Reward, bool]: 17 | assert action == 0 or action == 1, "Action must be in {0, 1} for the OceanEnv environnement." 18 | assert 1 <= self.state <= 10, "The agent should be between 1 and 10 meters when step is called." 
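        # Dynamics implemented below: action 0 swims one meter toward the beach
        # (state - 1) and action 1 swims one meter away (state + 1, capped at 10).
        # Every step yields a reward of -1, and the episode terminates once the
        # agent reaches the beach (state <= 0).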
19 | 20 | # Action has effect on the environment 21 | if action == 0: 22 | self.state -= 1 23 | elif action == 1: 24 | self.state += 1 25 | if self.state > 10: self.state = 10 26 | 27 | # Compute reward 28 | reward = -1 29 | 30 | # Check if env is terminated 31 | done = self.state <= 0 32 | 33 | return self.state, reward, done, {} 34 | 35 | 36 | 37 | 38 | 39 | def render(self): 40 | print(f"Agent is at {self.state} meters of the beach.") 41 | 42 | 43 | 44 | import numpy as np 45 | transition_probability_ocean = np.array([[[0 for _ in range(11)] for _ in range(2)] for _ in range(11)]) 46 | reward_probability_ocean = np.array([[0 for _ in range(2)] for _ in range(11)]) 47 | env = OceanEnv() 48 | for state in range(1, 11): 49 | for action in [0, 1]: 50 | env.state = state 51 | next_state, reward, done, info = env.step(action) 52 | transition_probability_ocean[state, action, next_state] = 1 53 | reward_probability_ocean[state, action] = reward 54 | 55 | if __name__ == "__main__": 56 | print("Transition probability:", transition_probability_ocean) 57 | print("Reward probability:", reward_probability_ocean) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv* 2 | test.py 3 | prep/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /TD/example_TD_control.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import env 5 | from TD.TDLearning import SARSA 6 | 7 | algo_SARSA = SARSA() 8 | 9 | print("\nFinding optimal policy...") 10 | optimal_policy, action_values = algo_SARSA.find_optimal_policy( env = env, 11 | gamma=.98, 12 | n_episodes = 200, 13 | n_steps = float("inf"), 14 | exploration_method='epsilon_greedy', 15 | epsilon=.1, 16 | alpha=.5, 17 | timelimit=40, 18 | return_action_values=True, 19 | initial_action_values="random", 20 | typical_value=-10, 21 | is_state_done=lambda state: state == 0, 22 | verbose=1, 23 | ) 24 | print("Optimal policy's probs:", optimal_policy.probs) 25 | print("Final action values:", action_values) 26 | 27 | print("\nActions and action values during the learning:") 28 | for elem in algo_SARSA.find_optimal_policy_yielding(env = env, 29 | gamma=.98, 30 | n_episodes = 10, 31 | n_steps = float("inf"), 32 | exploration_method='epsilon_greedy', 33 | epsilon=.1, 34 | alpha=.5, 35 | timelimit=40, 36 | return_action_values=True, 37 | initial_action_values="random", 38 | typical_value=-10, 39 | is_state_done=lambda state: state == 0, 40 | yielding_frequency="episode", 41 | ): 42 | print(elem) 43 | -------------------------------------------------------------------------------- /MC/example_MC_control.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import OceanEnv 5 | from MC.monteCarlo import MonteCarlo 6 | 7 | algo_MC = MonteCarlo() 8 | n_iterations = 10 9 | 10 | print("\nFinding optimal policy...") 11 | optimal_policy, action_values = algo_MC.find_optimal_policy(env = OceanEnv(), 12 | gamma=.98, 13 | n_iterations=n_iterations, 14 | evaluation_episodes=100, 15 | exploration_method='epsilon_greedy', 16 | epsilon=.1, 17 | visit_method="first_visit", 18 | averaging_method="moving", 19 | alpha=.1, 20 | timelimit=40, 21 | return_action_values=True, 22 | initial_action_values="random", 23 | typical_value=-10, 24 | is_state_done=lambda state: state == 0, 25 | verbose=1, 26 | ) 27 | print("Optimal policy's probs:", optimal_policy.probs) 28 | print("Final action values:", action_values) 29 | 30 | print("\nActions and action values during the learning:") 31 | for elem in algo_MC.find_optimal_policy_yielding( env = OceanEnv(), 32 | gamma=.98, 33 | n_iterations=2, 34 | evaluation_episodes=3, 35 | exploration_method='epsilon_greedy', 36 | epsilon=.1, 37 | visit_method="first_visit", 38 | averaging_method="moving", 39 | alpha=.1, 40 | timelimit=40, 41 | initial_action_values="optimistic", 42 | typical_value=-10, 43 | is_state_done=lambda state: state == 0, 44 | yield_frequency="episode", 45 | ): 46 | 
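    # Judging from how the plotting scripts in MC/ and TD/ consume this generator,
    # each yielded `elem` is either a short status string (which control/prediction
    # phase is running) or a NumPy array: a 1-D array of greedy actions per state,
    # or a 2-D array of Q-values with shape (n_states, n_actions).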
print(elem) 47 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Callable, Union 2 | import numpy as np 3 | 4 | class Observation: pass 5 | class Action: pass 6 | class Reward: pass 7 | 8 | class RL_algorithm: pass 9 | 10 | class Q_State: 11 | """A Q_State is a tuple of (observation, action)""" 12 | def __init__(self, observation: Observation, action: Action): 13 | self.observation = observation 14 | self.action = action 15 | def __hash__(self): 16 | return hash((self.observation, self.action)) 17 | def __eq__(self, other): 18 | return self.observation == other.observation and self.action == other.action 19 | def __str__(self): 20 | return f"({self.observation}, {self.action})" 21 | 22 | class Scheduler(Callable): 23 | """A Scheduler is a callable that given a number of episode or steps, returns the value of an hyper-parameter (learning rate, epsilon) to apply.""" 24 | def __init__(self, unit): 25 | if not unit in ["episodes", "steps"]: 26 | raise ValueError("Scheduler unit must be either 'episodes' or 'steps'") 27 | self.unit = unit 28 | super().__init__() 29 | def __call__(self, timestep: Union[int, None], episode : Union[int, None]): 30 | raise NotImplementedError("Scheduler must be implemented") 31 | 32 | def pretty_announcer(string): 33 | return "\n==========================================================\n" \ 34 | + string \ 35 | + "\n==========================================================\n" 36 | 37 | 38 | def initialize_values( 39 | shape : Tuple, 40 | initial_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 41 | typical_value : float = 1, 42 | ) -> np.ndarray: 43 | """This method initialize the state or action values and return it. 44 | shape : the shape of the values 45 | initial_values : the initial values 46 | typical_value : the typical value for the action values, used for scaling the "random" and "optimistic" value-initialization methods. 47 | """ 48 | 49 | 50 | if type(initial_values) == str: 51 | if initial_values == "random": 52 | values = np.random.normal(loc = 0, scale = abs(typical_value), size = shape) 53 | elif initial_values == "zeros": 54 | values = np.zeros(shape) 55 | elif initial_values == "optimistic": # Optimistic initialization is a trick that consist to overestimate the action values initially. This increase exploration for the greedy algorithms. 56 | optimistic_value = 2 * typical_value if typical_value > 0 else typical_value / 2 57 | values = np.ones(shape) * optimistic_value # An order of the magnitude of the reward is used to initialize optimistically the action values. 
58 | else: 59 | raise ValueError("The initial action values must be either 'random', 'zeros', 'optimistic' or a numpy array.") 60 | elif isinstance(initial_values, np.ndarray): 61 | values = initial_values 62 | else: 63 | raise ValueError("The initial action values must be either 'random', 'zeros', 'optimistic' or a numpy array.") 64 | 65 | return values -------------------------------------------------------------------------------- /TD/plot_control_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import OceanEnv 7 | from TD.TDLearning import SARSA 8 | from src.policies import DiscretePolicyForDiscreteState 9 | 10 | algo_SARSA = SARSA() 11 | 12 | S = np.arange(0,11) 13 | n_episodes = 50 14 | fps = 30 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | ### ====================================================================================================================== ### 23 | ### ============================================ Eps Greedy ============================================================== ### 24 | ### ====================================================================================================================== ### 25 | 26 | ### Plot the action values estimated through training 27 | src.policies_and_actions = algo_SARSA.find_optimal_policy_yielding(env = OceanEnv(), 28 | gamma=.98, 29 | n_episodes = n_episodes, 30 | n_steps = float("inf"), 31 | exploration_method='epsilon_greedy', 32 | epsilon=.1, 33 | alpha=.5, 34 | timelimit=40, 35 | return_action_values=True, 36 | initial_action_values="random", 37 | typical_value=-1, 38 | is_state_done=lambda state: state == 0, 39 | yielding_frequency="step", 40 | ) 41 | 42 | 43 | results = [e.copy() if type(e) == np.ndarray else e for e in src.policies_and_actions] 44 | 45 | bact = 4 46 | fig, ax = plt.subplots() 47 | ax.set_xlim(-1, 11) 48 | ax.set_ylim(-20, bact + 2) 49 | ax.set_xlabel("s") 50 | title = "Algorithm starting" 51 | 52 | actions_join, =ax.plot(S[results[0] == 0], [bact] * (len(S)-np.sum(results[0])), "r") 54 | qvalues_closer, = ax.plot(S, results[1][:, 0], ".g", label = "Q(s,<)") 55 | qvalues_far, = ax.plot(S, results[1][:, 1], "xr", label = "Q(s,>)") 56 | ax.legend() 57 | 58 | def update(n): 59 | data = results[n] 60 | if type(data) == str: 61 | ax.set_title(data) 62 | elif type(data) == np.ndarray: 63 | if len(data.shape) == 1: 64 | actions_join.set_data(S[data == 0], [bact] * (len(S)-np.sum(data))) 65 | actions_leave.set_data(S[data == 1], [bact] * np.sum(data)) 66 | elif len(data.shape) == 2: 67 | qvalues_closer.set_ydata(data[:, 0]) 68 | qvalues_far.set_ydata(data[:, 1]) 69 | 70 | anim = FuncAnimation( fig = fig, 71 | func = update, 72 | repeat = True, 73 | frames = np.arange(len(results)), 74 | interval = 20) 75 | 76 | anim.save("figure/TD/SARSA_Control_eps_greedy.gif", writer = "ffmpeg", fps = fps) 77 | plt.show() -------------------------------------------------------------------------------- /DP/example_IPE_leaveBeach.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 5 | from DP.dynamicProgramming import IterativePolicyEvaluation 6 | from src.policies import DiscretePolicyForDiscreteState 7 | 8 | policy_leave_beach = 
DiscretePolicyForDiscreteState(probs = np.array([[0, 1] for _ in range(11)])) 9 | 10 | algo_IPE = IterativePolicyEvaluation() 11 | 12 | print("\nComputing state values for the policy join_beach...") 13 | estimated_state_values = algo_IPE.find_state_values(policy = policy_leave_beach, 14 | transition_probability = transition_probability_ocean, 15 | reward_probability = reward_probability_ocean, 16 | n_iterations = 5, 17 | maximal_error = 0.01, 18 | gamma=0.99) 19 | print("Estimated state values :", estimated_state_values) 20 | 21 | print("\nEstimated state values during the learning:") 22 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_leave_beach, 23 | transition_probability = transition_probability_ocean, 24 | reward_probability = reward_probability_ocean, 25 | n_iterations = 12, 26 | maximal_error = 0.01, 27 | gamma = 1) 28 | for n_iter, estimated_state_values in enumerate(estimated_state_values_during_training): 29 | print(f"Iteration {n_iter} :", estimated_state_values) 30 | 31 | print("\nComputing action values for the policy join_beach...") 32 | estimated_action_values = algo_IPE.find_action_values( policy = policy_leave_beach, 33 | transition_probability=transition_probability_ocean, 34 | reward_probability=reward_probability_ocean, 35 | n_iterations=12, 36 | maximal_error=0.01, 37 | gamma=1) 38 | print("Estimated action values :", estimated_action_values) 39 | 40 | print("\nEstimated action values during the learning:") 41 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_leave_beach, 42 | transition_probability = transition_probability_ocean, 43 | reward_probability = reward_probability_ocean, 44 | n_iterations = 12, 45 | maximal_error = 0.01, 46 | gamma = 1) 47 | for n_iter, estimated_action_values in enumerate(estimated_action_values_during_training): 48 | print(f"Iteration {n_iter} :", estimated_action_values) -------------------------------------------------------------------------------- /DP/example_IPE_joinBeach.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 5 | from DP.dynamicProgramming import IterativePolicyEvaluation 6 | from src.policies import DiscretePolicyForDiscreteState 7 | 8 | policy_join_beach = DiscretePolicyForDiscreteState(probs = np.array([[1, 0] for _ in range(11)])) 9 | 10 | algo_IPE = IterativePolicyEvaluation() 11 | 12 | print("\nComputing state values for the policy join_beach...") 13 | estimated_state_values = algo_IPE.find_state_values(policy = policy_join_beach, 14 | transition_probability = transition_probability_ocean, 15 | reward_probability = reward_probability_ocean, 16 | n_iterations = 5, 17 | maximal_error = 0.01, 18 | gamma=1,) 19 | print("Estimated state values :", estimated_state_values) 20 | 21 | print("\nEstimated state values during the learning:") 22 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_join_beach, 23 | transition_probability = transition_probability_ocean, 24 | reward_probability = reward_probability_ocean, 25 | n_iterations = 12, 26 | maximal_error = 0.01, 27 | gamma = 1, 28 | ) 29 | for n_iter, estimated_state_values in enumerate(estimated_state_values_during_training): 30 | print(estimated_state_values) 31 | 32 | print("\nComputing action values for the policy join_beach...") 33 | 
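# Iterative Policy Evaluation repeatedly applies the Bellman expectation backup,
#   Q(s, a) <- R(s, a) + gamma * sum_{s'} P(s'|s, a) * sum_{a'} pi(a'|s') * Q(s', a'),
# presumably stopping once the largest update falls below `maximal_error` or after
# `n_iterations` sweeps; P and R are the tabular ocean model imported above.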
estimated_action_values = algo_IPE.find_action_values( policy = policy_join_beach, 34 | transition_probability=transition_probability_ocean, 35 | reward_probability=reward_probability_ocean, 36 | n_iterations=12, 37 | maximal_error=0.01, 38 | gamma=1) 39 | print("Estimated action values :", estimated_action_values) 40 | 41 | print("\nEstimated action values during the learning:") 42 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_join_beach, 43 | transition_probability = transition_probability_ocean, 44 | reward_probability = reward_probability_ocean, 45 | n_iterations = 12, 46 | maximal_error = 0.01, 47 | gamma = 1) 48 | for n_iter, estimated_action_values in enumerate(estimated_action_values_during_training): 49 | print(estimated_action_values) -------------------------------------------------------------------------------- /DP/example_IPE_swimRandomly.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 5 | from DP.dynamicProgramming import IterativePolicyEvaluation 6 | from src.policies import DiscretePolicyForDiscreteState 7 | 8 | policy_swim_randomly = DiscretePolicyForDiscreteState(probs = np.array([[0.8, 0.2] for _ in range(11)])) 9 | 10 | 11 | algo_IPE = IterativePolicyEvaluation() 12 | 13 | print("\nComputing state values for the policy swim_randomly...") 14 | estimated_state_values = algo_IPE.find_state_values(policy = policy_swim_randomly, 15 | transition_probability = transition_probability_ocean, 16 | reward_probability = reward_probability_ocean, 17 | n_iterations = 100, 18 | maximal_error = 0.01, 19 | gamma=.98) 20 | print("Estimated state values :", estimated_state_values) 21 | 22 | print("\nEstimated state values during the learning:") 23 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_swim_randomly, 24 | transition_probability = transition_probability_ocean, 25 | reward_probability = reward_probability_ocean, 26 | n_iterations = 1, 27 | maximal_error = 0.01, 28 | gamma = .98) 29 | for n_iter, estimated_state_values in enumerate(estimated_state_values_during_training): 30 | print(f"Iteration {n_iter} :", estimated_state_values) 31 | 32 | print("\nComputing action values for the policy swim_randomly...") 33 | estimated_action_values = algo_IPE.find_action_values( policy = policy_swim_randomly, 34 | transition_probability=transition_probability_ocean, 35 | reward_probability=reward_probability_ocean, 36 | n_iterations=100, 37 | maximal_error=0.01, 38 | gamma=.98) 39 | print("Estimated action values :", estimated_action_values) 40 | 41 | print("\nEstimated action values during the learning:") 42 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_swim_randomly, 43 | transition_probability = transition_probability_ocean, 44 | reward_probability = reward_probability_ocean, 45 | n_iterations = 1, 46 | maximal_error = 0.01, 47 | gamma = .98) 48 | for n_iter, estimated_action_values in enumerate(estimated_action_values_during_training): 49 | print(f"Iteration {n_iter} :", estimated_action_values) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Formation-Reinforcement-Learning 2 | This is the repository for the Reinforcement 
Learning course at Automatants, the AI student association of CentraleSupélec. The course was given to students of the CentraleSupélec campus as an introduction to Reinforcement Learning. 3 | 4 |

5 | 6 |

7 | 8 | Concepts covered in the first part (slides 1-39): 9 | - RL Framework (Environment (with examples), MDP, Policy, Cumulative reward, State and Action Value) 10 | - Environnement shaping (Reward shaping, State shaping, Action shaping) 11 | - Prediction and Control problems 12 | - Model-based methods : Dynamic Programming (Bellman Equations, Policy Iteration, Value Iteration) 13 | 14 | Concepts covered in the second part (slides 40-80): 15 | - Model-free methods : Monte Carlo, TD Learning (SARSA, Q-Learning, Expected SARSA), n-step TD Learning 16 | - Exploration-Exploitation Dilemma 17 | - Exploration Replay 18 | - Deep RL introduction 19 | - Deep Q Network (DQN) 20 | - Parallelization in RL 21 | - Librairies and ressources in RL 22 | 23 | Policy-based RL methods and Importance Sampling are also covered in the slides (81 - 88), but not in the lectures. 24 | 25 | 26 | 27 | 28 | # Videos 29 | 30 | Videos of the lectures are available (in French only) on the [Automatants Youtube channel](https://www.youtube.com/channel/UCZ2wKX6bJg9Yz9KdHkzjw1Q). 31 | 32 | Part 1: Introduction to Reinforcement Learning and Model-based methods (RL Framework, Bellman Equations, Dynamic Programming) 33 | 34 | - [Lecture 1: Introduction to Reinforcement Learning](https://www.youtube.com/watch?v=juNSptzWTJs) 35 | 36 | Part 2: Model-free methods and deeper concepts in RL : Monte Carlo, TD Learning (SARSA, Q-Learning, Expected SARSA), Exploration-Exploitation Dilemma, Off-Policy Learning, Deep RL intro 37 | 38 | - [Lecture 2: Deeper concepts in Reinforcement Learning](https://www.youtube.com/watch?v=LId8UpG_YY4) 39 | 40 | # Slides 41 | 42 | Slides of the lectures are available in this repository in French and English in the as powerpoint files "slides ENGLISH.pptx" and "slides FR.pptx". 43 | 44 |

45 | 46 |

47 | 48 | 49 | # Gridworld environment 50 | 51 |

52 | Q values through training 53 |

54 | 55 | The Gridworld environment is available [here](https://github.com/tboulet/gridworld_rl). It was a simple gridworld environment developped to implement the algorithms seen in the lectures. The goal was to visualize Q values or probabilities of actions during the training of the agent. Several environments/grids (with different rewards, obstacles, etc.) and several agents (including your own) are available. More information on the GitHub repository. 56 | 57 | # Streamlit app 58 | 59 | You can visualize the results of the algorithms seen in the lectures and the influence of many hyperparameters with the Streamlit app. 60 | 61 | This include 3 environnements : OceanEnv (reach the goal as fast as possible), Nim (take the last stick) and a Contextual Bandit environment (choose the best arm at each state). 62 |

63 | 64 |

65 | 66 | The app is deployed with Streamlit and should be available [here](https://share.streamlit.io/tboulet/formation-reinforcement-learning/main/app.py). 67 | 68 | If that is not the case, you can still install streamlit with pip and then run the app locally with the following command: 69 | ```bash 70 | streamlit run streamlit_app.py 71 | ``` -------------------------------------------------------------------------------- /environnements/contextualBanditEnv.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from src.utils import * 4 | import gym 5 | from gym import spaces 6 | 7 | class ContextualBanditEnv(gym.Env): 8 | n_states = 4 9 | n_actions = n_states 10 | is_terminal = True # True if 1 episode= 1step, False for a non terminal episode 11 | 12 | means = [k for k in range(n_states)] 13 | stds = [k+1 for k in range(n_states)] 14 | 15 | time_limit = float("inf") # careful, if not +oo, may lead to strange behavior perhaps for TD 16 | 17 | def __init__(self): 18 | # Define gym spaces 19 | self.action_space = spaces.Discrete(self.n_actions) 20 | self.observation_space = spaces.Discrete(self.n_states + int(self.is_terminal)) 21 | super().__init__() 22 | 23 | def reset(self) -> Observation: 24 | # Define initial state 25 | self.state = self.random_state() 26 | return self.state 27 | 28 | def step(self, action : Action) -> Tuple[Observation, Reward, bool]: 29 | # Check if action is valid (between 0 and num_rm - 1). 30 | assert action in range(self.n_actions), "Action must be in {0, 1, ..., self.n_actions - 1} for the contextualBanditEnv environnement." 31 | 32 | # Action has effect on the environment 33 | k = (self.state - action - 1) % self.n_states 34 | mean, std = self.means[k], self.stds[k] 35 | reward = np.random.normal(mean, std) 36 | if self.is_terminal: 37 | done = True 38 | self.state = self.n_states 39 | else: 40 | done = False 41 | self.state = self.random_state() 42 | 43 | return self.state, reward, done, {} 44 | 45 | def random_state(self): 46 | return random.choice(range(self.n_states)) 47 | 48 | def render(self, **kwargs): 49 | pass 50 | 51 | # For non terminal bandit problem 52 | if ContextualBanditEnv.is_terminal: 53 | transition_probability_CB = np.array([[[0 for _ in range(ContextualBanditEnv.n_states + 1)] for _ in range(ContextualBanditEnv.n_actions)] for _ in range(ContextualBanditEnv.n_states + 1)], dtype = float) 54 | for state in range(ContextualBanditEnv.n_states): 55 | for action in range(ContextualBanditEnv.n_actions): 56 | transition_probability_CB[state][action][ContextualBanditEnv.n_states] = 1 57 | reward_probability_CB = np.array([[0 for _ in range(ContextualBanditEnv.n_actions)] for _ in range(ContextualBanditEnv.n_states + 1)], dtype = float) 58 | for state in range(ContextualBanditEnv.n_states): 59 | for action in range(ContextualBanditEnv.n_actions): 60 | k = (state - action - 1) % ContextualBanditEnv.n_states 61 | reward_probability_CB[state][action] = ContextualBanditEnv.means[k] 62 | # For terminal bandit problem (1 episode = 1 step) 63 | else: 64 | transition_probability_CB = np.array([[[0 for _ in range(ContextualBanditEnv.n_states)] for _ in range(ContextualBanditEnv.n_actions)] for _ in range(ContextualBanditEnv.n_states)], dtype = float) 65 | reward_probability_CB = np.array([[0 for _ in range(ContextualBanditEnv.n_actions)] for _ in range(ContextualBanditEnv.n_states)], dtype = float) 66 | for state in range(ContextualBanditEnv.n_states): 67 | for action in 
range(ContextualBanditEnv.n_actions): 68 | k = (state - action - 1) % ContextualBanditEnv.n_states 69 | reward_probability_CB[state][action] = ContextualBanditEnv.means[k] 70 | 71 | 72 | 73 | if __name__ == "__main__": 74 | print("Transition probability for state 0 P(0,a,s'):", transition_probability_CB[0]) 75 | print("Reward probability:", reward_probability_CB) -------------------------------------------------------------------------------- /MC/plot_control_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import OceanEnv 7 | from MC.monteCarlo import MonteCarlo 8 | from src.policies import DiscretePolicyForDiscreteState 9 | 10 | algo_MC = MonteCarlo() 11 | 12 | n_iterations = 8 13 | n_iterations_evaluation = 40 14 | S = np.arange(0,11) 15 | fps = 30 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | ### ====================================================================================================================== ### 24 | ### ============================================ Eps Greedy ============================================================== ### 25 | ### ====================================================================================================================== ### 26 | 27 | ### Plot the action values estimated through training 28 | policies_and_actions = algo_MC.find_optimal_policy_yielding( env = OceanEnv(), 29 | gamma=.98, 30 | n_iterations=n_iterations, 31 | evaluation_episodes=n_iterations_evaluation, 32 | exploration_method='epsilon_greedy', 33 | epsilon=.1, 34 | visit_method="first_visit", 35 | averaging_method="moving", 36 | alpha=.1, 37 | timelimit=40, 38 | initial_action_values="random", 39 | typical_value=-10, 40 | is_state_done=lambda state: state == 0, 41 | ) 42 | 43 | 44 | results = [e.copy() if type(e) == np.ndarray else e for e in policies_and_actions] 45 | 46 | bact = 4 47 | fig, ax = plt.subplots() 48 | ax.set_xlim(-1, 11) 49 | ax.set_ylim(-20, bact + 2) 50 | ax.set_xlabel("s") 51 | title_control = f"MC Control : 0/{n_iterations}" 52 | title_prediction = f"MC Prediction : 0/{n_iterations_evaluation}" 53 | 54 | actions_join, =ax.plot(S[results[0] == 0], [bact] * (len(S)-np.sum(results[0])), "r") 56 | qvalues_closer, = ax.plot(S, results[1][:, 0], ".g", label = "Q(s,<)") 57 | qvalues_far, = ax.plot(S, results[1][:, 1], "xr", label = "Q(s,>)") 58 | ax.legend() 59 | 60 | def update(n): 61 | global title_control, title_prediction 62 | if n>= len(results): 63 | ax.set_title("MC Control (ended)") 64 | return 65 | data = results[n] 66 | if type(data) == str: 67 | if "MC Control" in data: 68 | title_control = data 69 | ax.set_title(title_control + " - " + title_prediction) 70 | elif "MC Prediction" in data: 71 | title_prediction = data 72 | ax.set_title(title_control + " - " + title_prediction) 73 | elif type(data) == np.ndarray: 74 | if len(data.shape) == 1: 75 | actions_join.set_data(S[data == 0], [bact] * (len(S)-np.sum(data))) 76 | actions_leave.set_data(S[data == 1], [bact] * np.sum(data)) 77 | elif len(data.shape) == 2: 78 | qvalues_closer.set_ydata(data[:, 0]) 79 | qvalues_far.set_ydata(data[:, 1]) 80 | 81 | anim = FuncAnimation( fig = fig, 82 | func = update, 83 | repeat = True, 84 | frames = np.arange(len(results)), 85 | interval = 20) 86 | plt.show() 87 | anim.save("figure/MC/MC_Control_eps_greedy.gif", writer = "ffmpeg", fps = 30) 88 | 
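The two control-plotting scripts above visualize the output of an epsilon-greedy control loop whose implementation (MC/monteCarlo.py, TD/TDLearning.py) is not included in this excerpt. As a rough, hypothetical sketch of the two operations being plotted, epsilon-greedy action selection and greedy-policy extraction from a Q-table, something like the following could be used (the function name and the dummy Q-table are illustrative, not the repository's API):

```python
import numpy as np

def epsilon_greedy_action(Q: np.ndarray, state: int, epsilon: float = 0.1) -> int:
    """With probability epsilon pick a uniformly random action, otherwise the greedy one."""
    n_actions = Q.shape[1]
    if np.random.random() < epsilon:
        return int(np.random.randint(n_actions))
    return int(np.argmax(Q[state]))

# Dummy Q-values for the 11 ocean states and 2 actions.
Q = np.random.normal(loc=-10, scale=1.0, size=(11, 2))
a = epsilon_greedy_action(Q, state=10)

# The greedy actions drawn at the top of the animations (actions_join / actions_leave)
# can be recovered as the argmax of the Q-table over the action axis:
greedy_actions = np.argmax(Q, axis=1)  # shape (11,), values in {0, 1}
```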
-------------------------------------------------------------------------------- /environnements/nimEnv.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from src.utils import * 4 | import gym 5 | from gym import spaces 6 | 7 | class NimEnv(gym.Env): 8 | num_initial_objects = 21 # Number of objects initially in the pile 9 | num_rm = 3 # Maximal number of objects removable each turn 10 | 11 | def __init__(self): 12 | # Define gym spaces 13 | self.action_space = spaces.Discrete(self.num_rm) 14 | self.observation_space = spaces.Discrete(self.num_initial_objects + 1) 15 | super().__init__() 16 | 17 | def reset(self) -> Observation: 18 | # Define initial state 19 | self.num_objects = self.num_initial_objects 20 | return self.num_objects 21 | 22 | def step(self, action) -> Tuple[Observation, Reward, bool]: 23 | # Check if action is valid (between 0 and num_rm - 1). 24 | assert self.num_objects > 0, "The game should not be finished when step() is called." 25 | assert action in range(self.num_rm), "Action must be in {0, 1, ..., self.num_rm - 1} for the nimEnv environnement." 26 | action += 1 27 | # Action has effect on the environment 28 | self.num_objects -= action 29 | # Compute reward and done 30 | if self.num_objects <= 0: 31 | reward = -1 32 | done = True 33 | self.num_objects = 0 34 | else: 35 | action_opponent = self.opponent_act(state = self.num_objects) 36 | self.num_objects -= action_opponent 37 | if self.num_objects <= 0: 38 | reward = 1 39 | done = True 40 | self.num_objects = 0 41 | else: 42 | reward = 0 43 | done = False 44 | # Return observation, reward, done, and info 45 | return self.num_objects, reward, done, {} 46 | 47 | def opponent_act(self, state : Observation = None) -> Action: 48 | # Choose action according to opponent policy (uniformly random) 49 | action = random.choice(range(self.num_rm)) + 1 50 | return action 51 | 52 | def render(self, **kwargs): 53 | print(f"{self.num_objects}/{self.num_initial_objects} objects remaining.") 54 | 55 | 56 | import numpy as np 57 | n_states = NimEnv.num_initial_objects + 1 58 | n_actions = NimEnv.num_rm 59 | transition_probability_nim = np.array([[[0 for _ in range(n_states)] for _ in range(n_actions)] for _ in range(n_states)], dtype = float) 60 | reward_probability_nim = np.array([[0 for _ in range(n_actions)] for _ in range(n_states)], dtype = float) 61 | env = NimEnv() 62 | for state in range(1, n_states): 63 | for action in range(n_actions): 64 | num_objects_removed = action + 1 65 | num_objects_remaining = state - num_objects_removed 66 | 67 | # Here the agent failed and remove the last object, reaching state 0 and receiving a reward of -1 68 | if num_objects_remaining <= 0: 69 | reward_probability_nim[state, action] = -1 70 | transition_probability_nim[state, action, 0] = 1 71 | # Here the agent did not remove the last object, and the opponent may remove it. 72 | else: 73 | prob = 1 / n_actions 74 | for action_opponent in range(n_actions): 75 | num_objects_removed_opponent = action_opponent + 1 76 | num_objects_remaining_opponent = num_objects_remaining - num_objects_removed_opponent 77 | # Here the agent did not remove the last object, and the opponent did not remove it. 78 | if num_objects_remaining_opponent > 0: 79 | transition_probability_nim[state, action, num_objects_remaining_opponent] = prob 80 | # Here the agent did not remove the last object, and the opponent removed the last object. 81 | # The agent receives a reward of 1 and reaches state 0. 
82 | else: 83 | reward_probability_nim[state, action] += prob 84 | transition_probability_nim[state, action, 0] += prob 85 | 86 | if __name__ == "__main__": 87 | print("Transition probability for state 5 P(5,a,s'):", transition_probability_nim[5]) 88 | print("Reward probability:", reward_probability_nim) -------------------------------------------------------------------------------- /playground_app/mappings.py: -------------------------------------------------------------------------------- 1 | from MC.monteCarlo import MonteCarlo 2 | from DP.dynamicProgramming import IterativePolicyEvaluation, PolicyIteration, ValueIteration 3 | from TD.TDLearning import TD, SARSA 4 | 5 | from environnements.oceanEnv import OceanEnv, transition_probability_ocean, reward_probability_ocean 6 | from environnements.nimEnv import NimEnv, transition_probability_nim, reward_probability_nim 7 | from environnements.contextualBanditEnv import ContextualBanditEnv, transition_probability_CB, reward_probability_CB 8 | 9 | map_name_to_algo = {"IterativePolicyEvaluation": { "Algo": IterativePolicyEvaluation, 10 | "family": "DP"}, 11 | "PolicyIteration": {"Algo": PolicyIteration, 12 | "family": "DP"}, 13 | "ValueIteration": {"Algo": ValueIteration, 14 | "family": "DP"}, 15 | "MonteCarlo": { "Algo": MonteCarlo, 16 | "family": "MC"}, 17 | "TD(0)": { "Algo": TD, 18 | "family": "TD"}, 19 | "SARSA" : {"Algo" : SARSA, 20 | "family": "TD"}, 21 | 22 | } 23 | 24 | map_name_to_env = { "Ocean Env": { "Env" : OceanEnv, 25 | "model" : (transition_probability_ocean, reward_probability_ocean), 26 | "is_state_done" : lambda state : state == 0, 27 | "range_values" : [-20, 5], 28 | "image_path" : "figure/ocean_env.jpeg", 29 | "description" : "In this environment you need to reach the beach as fast as possible. \ 30 | You start in the ocean and you can only move in the 2 directions. \ 31 | The state consist of the distance with the beach and is represented by an integer between 0 and 10 \ 32 | (you can't go more far than 10). The reward is -1 at each step and 0 when you reach the beach. \ 33 | The episode ends when you reach the beach. \ 34 | ", 35 | }, 36 | 37 | "Nim's Game" : { "Env" : NimEnv, 38 | "model" : (transition_probability_nim, reward_probability_nim), 39 | "is_state_done" : lambda state : state <= 0, 40 | "range_values" : [-2, 2], 41 | "image_path" : "figure/nim_env.png", 42 | "description" : "In this game you start with 10 matches and you can remove 1, 2 or 3 matches at each step (those are your actions). The player that removes the last match loses. You play against a random agent. The state consist of the number of matches left and is represented by an integer between 0 and n_matches=25. The reward is 1 if you win, -1 if you lose and 0 if the game is not finished. The episode ends when the game is finished." 43 | }, 44 | 45 | "n-Bandit Contextual" : { "Env" : ContextualBanditEnv, 46 | "model" : (transition_probability_CB, reward_probability_CB), 47 | "is_state_done" : lambda state : state == -1, 48 | "range_values" : [-1, 4], 49 | "image_path" : "figure/bandit_env.png", 50 | "description" : "In this famous environment, which is a foundation problem of theoretical RL, you have a slot machine with 4 arms. Each arm ill give you a reward following a random law that you don't now. This is contextual because which arm is better depends on the state. 
In particular here, the expected reward is r(s,a) = (s-a-1)%4 so the optimal action for each state is pi*(s)=s.", 51 | }, 52 | 53 | } 54 | 55 | map_problem_to_algo_names = { "Prediction Problem" : ["MonteCarlo", "IterativePolicyEvaluation", "TD(0)", "SARSA"], 56 | "Control Problem" : ["MonteCarlo", "PolicyIteration", "ValueIteration", "SARSA"], 57 | } -------------------------------------------------------------------------------- /TD/example_TD_prediction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import OceanEnv 5 | from TD.TDLearning import TD, SARSA 6 | from src.policies import DiscretePolicyForDiscreteState 7 | 8 | policy_swim_randomly = DiscretePolicyForDiscreteState(probs = np.array([[0.8, 0.2] for _ in range(11)])) 9 | 10 | algo_TD = TD() 11 | algo_SARSA = SARSA() 12 | 13 | print("\nComputing state values for the policy policy_swim_randomly...") 14 | estimated_state_values = algo_TD.find_state_values( policy = policy_swim_randomly, 15 | env = OceanEnv(), 16 | n_episodes = 100, 17 | n_steps = 100000, 18 | gamma=0.99, 19 | alpha=0.5, 20 | timelimit=40, 21 | initial_state_values="random", 22 | typical_value = -5, 23 | exploring_starts=False, 24 | is_state_done=lambda state: state == 0, 25 | verbose=0, 26 | ) 27 | print("Estimated state values :", estimated_state_values) 28 | 29 | print("\nEstimated state values during the learning:") 30 | estimated_state_values_during_training = algo_TD.find_state_values_yielding(policy = policy_swim_randomly, 31 | env = OceanEnv(), 32 | n_episodes = 50, 33 | n_steps = float("inf"), 34 | gamma=0.99, 35 | alpha=0.1, 36 | timelimit=40, 37 | initial_state_values="random", 38 | typical_value = -5, 39 | exploring_starts=False, 40 | is_state_done=lambda state: state == 0, 41 | 42 | yield_frequency="episode", 43 | ) 44 | for estimated_state_values in estimated_state_values_during_training: 45 | print(estimated_state_values) 46 | 47 | print("\nComputing action values for the policy policy_swim_randomly...") 48 | estimated_action_values = algo_SARSA.find_action_values( policy = policy_swim_randomly, 49 | env = OceanEnv(), 50 | n_episodes = 100, 51 | n_steps = float("inf"), 52 | gamma=0.99, 53 | alpha=0.1, 54 | timelimit=40, 55 | initial_action_values="random", 56 | typical_value = -5, 57 | exploring_starts=False, 58 | is_state_done=lambda state: state == 0, 59 | verbose=0, 60 | ) 61 | print("Estimated action values :", estimated_action_values) 62 | 63 | print("\nEstimated action values during the learning:") 64 | estimated_action_values_during_training = algo_TD.find_action_values_yielding( policy = policy_swim_randomly, 65 | env = OceanEnv(), 66 | n_episodes = 1, 67 | n_steps = 10, 68 | gamma=0.99, 69 | alpha=0.1, 70 | timelimit=40, 71 | initial_action_values="random", 72 | typical_value = -5, 73 | exploring_starts=False, 74 | is_state_done=lambda state: state == 0, 75 | 76 | yield_frequency="step", 77 | ) 78 | for estimated_action_values in estimated_action_values_during_training: 79 | print(estimated_action_values) -------------------------------------------------------------------------------- /MC/example_MC_prediction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import OceanEnv 5 | from MC.monteCarlo import MonteCarlo 6 | from src.policies import DiscretePolicyForDiscreteState 7 | 8 | 
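# The "swim randomly" policy defined below moves toward the beach (action 0) with
# probability 0.8 and away from it (action 1) with probability 0.2, in all 11 states.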
policy_swim_randomly = DiscretePolicyForDiscreteState(probs = np.array([[0.8, 0.2] for _ in range(11)])) 9 | 10 | algo_MC = MonteCarlo() 11 | 12 | print("\nComputing state values for the policy policy_swim_randomly...") 13 | estimated_state_values = algo_MC.find_state_values( policy = policy_swim_randomly, 14 | env = OceanEnv(), 15 | n_episodes = 10, 16 | gamma=0.98, 17 | visit_method="first_visit", 18 | averaging_method="moving", 19 | alpha=0.1, 20 | timelimit=40, 21 | initial_state_values="random", 22 | typical_value = -5, 23 | exploring_starts=False, 24 | is_state_done=lambda state: state == 0, 25 | verbose=1, 26 | ) 27 | print("Estimated state values :", estimated_state_values) 28 | 29 | print("\nEstimated state values during the learning:") 30 | estimated_state_values_during_training = algo_MC.find_state_values_yielding(policy = policy_swim_randomly, 31 | env = OceanEnv(), 32 | n_episodes = 2, 33 | gamma=0.98, 34 | visit_method="first_visit", 35 | averaging_method="moving", 36 | alpha=0.1, 37 | timelimit=40, 38 | initial_state_values="random", 39 | typical_value = -5, 40 | exploring_starts=False, 41 | is_state_done=lambda state: state == 0, 42 | ) 43 | for estimated_state_values in estimated_state_values_during_training: 44 | print(estimated_state_values) 45 | 46 | print("\nComputing action values for the policy policy_swim_randomly...") 47 | estimated_action_values = algo_MC.find_action_values( policy = policy_swim_randomly, 48 | env = OceanEnv(), 49 | n_episodes=10, 50 | gamma=0.98, 51 | visit_method="first_visit", 52 | averaging_method="moving", 53 | alpha=0.05, 54 | timelimit=40, 55 | initial_action_values="random", 56 | typical_value=-10, 57 | exploring_starts=False, 58 | is_state_done=lambda state: state == 0, 59 | ) 60 | print("Estimated action values :", estimated_action_values) 61 | 62 | print("\nEstimated action values during the learning:") 63 | estimated_action_values_during_training = algo_MC.find_action_values_yielding( policy = policy_swim_randomly, 64 | env = OceanEnv(), 65 | n_episodes=2, 66 | gamma=0.98, 67 | visit_method="first_visit", 68 | averaging_method="moving", 69 | alpha=0.05, 70 | hotimelimitrizon=40, 71 | initial_action_values="random", 72 | typical_value=-10, 73 | exploring_starts=False, 74 | is_done_states=lambda state: state == 0, 75 | ) 76 | for estimated_action_values in estimated_action_values_during_training: 77 | print(estimated_action_values) -------------------------------------------------------------------------------- /playground_app/playground.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import numpy as np 3 | import pandas as pd 4 | import plotly.express as px 5 | import plotly.graph_objects as go 6 | from src.utils import * 7 | 8 | def run_rl_algorithm(**config): 9 | """Run an algorithm and display result on streamlit. 10 | """ 11 | st.header("Results of training:") 12 | if config["family"] == "MC": 13 | config["yield_frequency"] = st.selectbox("Display a frame each... (higher frequency slow down displaying)", ["step", "episode", "iteration"], index=1) 14 | elif config["family"] == "DP": 15 | config["yield_frequency"] = st.selectbox("Display a frame each... (higher frequency slow down displaying)", ["step", "iteration", "global_iteration"], index=1) 16 | elif config["family"] == "TD": 17 | config["yield_frequency"] = st.selectbox("Display a frame each... 
(higher frequency slow down displaying)", ["step", "episode"], index=1) 18 | else: 19 | raise ValueError("Unknown family: {}".format(config["family"])) 20 | 21 | # Generate for the good problem with the good algo and for the specified config 22 | algo = config["algo"] 23 | problem = config["problem"] 24 | try: 25 | if problem == "Prediction Problem": 26 | values_type = config["values_type"] 27 | if values_type == "State values V": 28 | datas = algo.find_state_values_yielding(**config) 29 | elif values_type == "Action values Q": 30 | datas = algo.find_action_values_yielding(**config) 31 | elif problem == "Control Problem": 32 | datas = algo.find_optimal_policy_yielding(**config) 33 | except AttributeError: 34 | raise ValueError(f"Algorithm {config['algo_name']} does not work for finding the specified values (if Prediction Problem) or finding the optimal policy. Please change problem or values kind.") 35 | 36 | #Treat this data 37 | title = "Algo starting" 38 | title_control = "" 39 | title_prediction = "" 40 | 41 | num_frame = 0 42 | frame_titles = dict() 43 | datas_list = list() 44 | env = config["env"] 45 | n_states, n_actions = env.observation_space.n, env.action_space.n 46 | 47 | greedy_actions = None 48 | a, b = config["range_values"] 49 | y_greedy_actions = 0.9 * b + 0.1 * a 50 | for data in datas: 51 | # If the data is a string, modify the title of the next frames. 52 | if type(data) == str: 53 | if "Prediction" in data: 54 | title_prediction = data 55 | elif "Control" in data: 56 | title_control = data 57 | title = title_control + " | " + title_prediction 58 | 59 | # If the data is an array, it can either be a Q(s,a), V(s) or greedy_actions(s). We are building a new frame. 60 | elif type(data) == np.ndarray: 61 | # Save the title of the frame. We will apply this title later. 62 | frame_titles[num_frame] = title 63 | # Add plot of actions. 64 | if greedy_actions is not None: 65 | for state in range(n_states): 66 | datas_list.append([num_frame, state, greedy_actions[state], y_greedy_actions]) 67 | # Add plot of Q values 68 | if len(data.shape) == 2: # Q values 69 | for state in range(n_states): 70 | for action in range(n_actions): 71 | datas_list.append([num_frame, state, action, data[state, action]]) 72 | # Add plot of V values or update greedy_actions depending of the nature of the problem which define the type of 1-dimensionnaly shaped data returned (V or actions). 73 | elif len(data.shape) == 1: # 74 | if problem == "Prediction Problem": #V values 75 | for state in range(n_states): 76 | datas_list.append([num_frame, state, -1, data[state]]) 77 | elif problem == "Control Problem": # greedy actions 78 | greedy_actions = data 79 | 80 | else: 81 | raise ValueError("data must be either a string or a numpy array") 82 | num_frame += 1 83 | #Create df and plotly figure : we plot the value in function of the state, and the time-axis is defined as frame. We group data by action to distinguish Q(s,a) for different a. 
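# Sketch of the long-format dataframe built just below (one row per plotted point; the column
# semantics follow from the appends above):
#   frame | state | action            | values
#   ------+-------+-------------------+----------------------------------------------
#     k   |   s   | a                 | Q(s, a) estimate at frame k
#     k   |   s   | -1                | V(s) estimate at frame k (Prediction Problem)
#     k   |   s   | greedy_actions[s] | y_greedy_actions, a constant marker height (Control Problem)
# Plotly then animates over "frame" and colours the points by "action".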
84 | df = pd.DataFrame(datas_list, columns=["frame", "state", "action", "values"]) 85 | range_states = [-1, env.observation_space.n] 86 | fig = px.scatter(df, x = "state", 87 | y = "values", 88 | color = "action", # if values_type == "Action values Q" else None, 89 | animation_frame="frame", 90 | range_x=range_states, range_y=config["range_values"]) 91 | 92 | #This is for animated title for an animation (only way kekw) 93 | if len(fig.layout.updatemenus) == 0: raise ValueError("Likely cause of this error : The frequency for frame doesn't make sense for this algorithm, please change.") 94 | for button in fig.layout.updatemenus[0].buttons: 95 | button['args'][1]['frame']['redraw'] = True 96 | for k in range(len(fig.frames)): 97 | fig.frames[k]['layout'].update(title_text=frame_titles[k]) 98 | 99 | 100 | # fig.add_trace(go.Scatter( 101 | # x=range_states, 102 | # y=[y_greedy_actions] * len(range_states), 103 | # name="", 104 | # )) 105 | 106 | #Display the figure 107 | if st.checkbox("Display training"): 108 | st.plotly_chart(fig) 109 | if greedy_actions is not None: 110 | st.write(f"The points that stays at y={y_greedy_actions} represents the greedy action. They are those chosen by the agent in the case of a greedy policy.") -------------------------------------------------------------------------------- /streamlit_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import numpy as np 3 | 4 | from src.utils import * 5 | from src.policies import DiscretePolicyForDiscreteState 6 | 7 | from playground_app.playground import * 8 | from playground_app.mappings import map_name_to_algo, map_name_to_env, map_problem_to_algo_names 9 | 10 | st.title("Reinforcement Learning Playground") 11 | config = {} 12 | 13 | # Input 1 : env and problem type 14 | st.sidebar.header("Problem") 15 | env_name = st.sidebar.selectbox("Environment", map_name_to_env.keys()) 16 | st.sidebar.caption(map_name_to_env[env_name]["description"]) 17 | problem = st.sidebar.selectbox("Problem", ["Prediction Problem", "Control Problem"]) 18 | 19 | env_dict = map_name_to_env[env_name] 20 | Pssa, Rsa = env_dict["model"] 21 | env = env_dict["Env"]() 22 | env_description = env_dict["description"] 23 | env_image_path = env_dict["image_path"] 24 | config["env"] = env 25 | config["transition_probability"] = Pssa 26 | config["reward_probability"] = Rsa 27 | config["range_values"] = env_dict["range_values"] 28 | config["problem"] = problem 29 | 30 | st.header(f"Environment : {env_name}") 31 | st.caption(env_description) 32 | st.image(env_image_path) 33 | 34 | 35 | if problem == "Prediction Problem": 36 | # Input 2 : policy to evaluate, value type and algo 37 | st.header("Algorithm used") 38 | 39 | algo_name = st.selectbox("Algorithm", map_problem_to_algo_names["Prediction Problem"]) 40 | Algo = map_name_to_algo[algo_name]["Algo"] 41 | 42 | values_type = st.selectbox("Values to estimate", ["State values V", "Action values Q"]) 43 | 44 | n_actions = env.action_space.n 45 | action_probs = list() 46 | st.caption("Policy to evaluate: (will be normalized). 
This playground can only evaluate blind policy (non dependant on states).") 47 | for action in range(n_actions): 48 | action_probs.append(st.slider(f"Action {action}", 0, 100, value=50)) 49 | action_probs = np.array(action_probs) / np.sum(action_probs) 50 | probs = np.array([action_probs for _ in range(env.observation_space.n)]) 51 | policy = DiscretePolicyForDiscreteState(probs = probs) 52 | 53 | 54 | 55 | config["policy"] = policy 56 | config["algo_name"] = algo_name 57 | config["algo"] = Algo() 58 | config["family"] = map_name_to_algo[algo_name]["family"] 59 | config["values_type"] = values_type 60 | 61 | elif problem == "Control Problem": 62 | # Input 2 : algo 63 | st.header("Algorithm used") 64 | 65 | algo_name = st.selectbox("Algorithm", map_problem_to_algo_names["Control Problem"]) 66 | Algo = map_name_to_algo[algo_name]["Algo"] 67 | 68 | config["algo_name"] = algo_name 69 | config["algo"] = Algo() 70 | config["family"] = map_name_to_algo[algo_name]["family"] 71 | 72 | 73 | # Input 3 : Problem-related parameters 74 | st.header("Hyperparameters") 75 | col_problem, col_algo = st.columns(2) 76 | with col_problem: 77 | if problem == "Prediction Problem": 78 | st.subheader("Prediction problem:") 79 | if map_name_to_algo[algo_name]["family"] == "MC": # n_episode 80 | config["n_episodes"] = st.number_input("Number of episodes", value=20) 81 | config["exploring_starts"] = st.checkbox("Exploring starts", value=False) # exploring_starts 82 | if config["exploring_starts"]: config["is_state_done"] = map_name_to_env[env_name]["is_state_done"] 83 | elif map_name_to_algo[algo_name]["family"] == "TD": 84 | pass 85 | elif map_name_to_algo[algo_name]["family"] == "DP": # n_iterations 86 | config["n_iterations"] = st.number_input("Number of iterations", value=20) 87 | 88 | elif problem == "Control Problem": 89 | st.subheader("Control problem:") 90 | if map_name_to_algo[algo_name]["family"] == "MC": 91 | config["n_iterations"] = st.number_input("Number of iterations", value=10) 92 | config["evaluation_episodes"] = st.number_input("Number of episodes at each evaluation of the policy", value=50) 93 | if map_name_to_algo[algo_name]["family"] == "TD": 94 | pass 95 | if map_name_to_algo[algo_name]["family"] == "DP": 96 | config["n_iterations"] = st.number_input("Number of iterations", value=10) 97 | # Input 4 : Algorithm-related parameters 98 | with col_algo: 99 | if map_name_to_algo[algo_name]["family"] == "MC": 100 | st.subheader("Monte Carlo:") 101 | config["visit_method"] = st.selectbox("Visit method", ["first_visit"]) 102 | config["averaging_method"] = st.selectbox("Averaging method", ["cumulative", "moving"]) 103 | config["alpha"] = st.slider("Learning rate", 0.0, 1.0, value=0.1) 104 | if problem == "Prediction Problem": 105 | pass 106 | elif problem == "Control Problem": 107 | config["exploration_method"] = st.selectbox("Exploration method", ["epsilon_greedy", "greedy", "exploring_starts"]) 108 | if config["exploration_method"] == "epsilon_greedy": 109 | config["epsilon"] = st.slider("Epsilon", 0., 1., value=0.1) 110 | 111 | if map_name_to_algo[algo_name]["family"] == "TD": 112 | st.subheader("TD Learning:") 113 | config["n_episodes"] = st.number_input("Maximal duration in episodes", value=20) 114 | config["n_steps"] = st.number_input("Maximal duration in steps", value=30*20) 115 | config["alpha"] = st.slider("Learning rate", 0.0, 1.0, value=0.1) 116 | 117 | if map_name_to_algo[algo_name]["family"] == "DP": 118 | st.subheader("Dynamic Programming:") 119 | st.write("Criterium for convergence of DP 
algorithms:") 120 | maximal_error = st.number_input("Error threshold for convergence", value=0.01) 121 | config["maximal_error"] = maximal_error 122 | config["IPE_maximal_error"] = maximal_error 123 | config["sweep_order"] = st.selectbox("Sweep order for states", ["normal", "reverse", "random"]) 124 | if problem == "Prediction Problem": 125 | pass 126 | elif problem == "Control Problem": 127 | config["IPE_n_iterations"] = st.number_input("Number of iterations for the IPE algorithm", value=20) 128 | 129 | # Input 4 : Hyperparameters 130 | with st.sidebar: 131 | st.header("Environnement hyperparameters") 132 | config["gamma"] = st.number_input("Discount factor", value=0.95) 133 | config["timelimit"] = st.slider("Time limit", 0, 100, value=40) 134 | initial_values = st.selectbox("Initial values", ["zeros", "random", "optimistic"]) 135 | config["initial_state_values"] = initial_values 136 | config["initial_action_values"] = initial_values 137 | config["typical_value"] = st.number_input("Typical value (in magnitude order)", value=1) 138 | 139 | # Output : compute values and display 140 | run_rl_algorithm(**config) -------------------------------------------------------------------------------- /DP/plot_control_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 7 | from DP.dynamicProgramming import PolicyIteration, ValueIteration 8 | 9 | n_iterations = 10 10 | S = np.arange(0,11) 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | ### ====================================================================================================================== ### 20 | ### ============================================ Policy Iteration ======================================================== ### 21 | ### ====================================================================================================================== ### 22 | 23 | algo_PI = PolicyIteration() 24 | 25 | ### Plot the state values estimated through training 26 | src.policies_and_actions = algo_PI.find_optimal_policy_yielding(transition_probability=transition_probability_ocean, 27 | reward_probability=reward_probability_ocean, 28 | gamma=.98, 29 | n_iterations=8, 30 | IPE_n_iterations=5, 31 | IPE_threshold=.05, 32 | sweep_order="random", 33 | initial_action_values="random", 34 | typical_value=-1, 35 | yield_frequency="step", 36 | ) 37 | 38 | 39 | results = [e.copy() if type(e) == np.ndarray else e for e in src.policies_and_actions] 40 | 41 | bact = 4 42 | fig, ax = plt.subplots() 43 | ax.set_xlim(-1, 11) 44 | ax.set_ylim(-20, bact + 2) 45 | ax.set_xlabel("s") 46 | title_control = f"DP Control (PI or VI) - Iteration 0" 47 | title_prediction = f"DP Prediction of Q (IPE) - Iteration 0" 48 | 49 | # actions, = ax.plot(S, results[1] + bact, ".b", label = "Actions") 50 | actions_join, =ax.plot(S[results[0] == 0], [bact] * (len(S)-np.sum(results[0])), "r") 52 | qvalues_closer, = ax.plot(S, results[1][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 53 | qvalues_far, = ax.plot(S, results[1][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 54 | ax.legend() 55 | 56 | def update(n): 57 | global title_control, title_prediction 58 | if n>= len(results): 59 | ax.set_title("Policy Iteration (ended)") 60 | return 61 | data = results[n] 62 | if 
type(data) == str: 63 | if "Control" in data: 64 | title_control = data 65 | ax.set_title(title_control + " | " + title_prediction) 66 | elif "Prediction" in data: 67 | title_prediction = data 68 | ax.set_title(title_control + " | " + title_prediction) 69 | elif type(data) == np.ndarray: 70 | if len(data.shape) == 1: 71 | actions_join.set_data(S[data == 0], [bact] * (len(S)-np.sum(data))) 72 | actions_leave.set_data(S[data == 1], [bact] * np.sum(data)) 73 | elif len(data.shape) == 2: 74 | qvalues_closer.set_ydata(data[:, 0]) 75 | qvalues_far.set_ydata(data[:, 1]) 76 | 77 | anim = FuncAnimation( fig = fig, 78 | func = update, 79 | repeat = True, 80 | frames = np.arange(2, len(results)), 81 | interval = 100) 82 | 83 | plt.show() 84 | anim.save("figure/DP/policy_iteration.gif", writer = "ffmpeg", fps = 30) 85 | 86 | 87 | 88 | 89 | 90 | 91 | ### ====================================================================================================================== ### 92 | ### ============================================ Value Iteration ========================================================= ### 93 | ### ====================================================================================================================== ### 94 | 95 | algo_VI = ValueIteration() 96 | 97 | ### Plot the state values estimated through training 98 | src.policies_and_actions = algo_VI.find_optimal_policy_yielding(transition_probability=transition_probability_ocean, 99 | reward_probability=reward_probability_ocean, 100 | gamma=.98, 101 | n_iterations=15, 102 | sweep_order="random", 103 | initial_action_values="random", 104 | typical_value=-1, 105 | yield_frequency="step", 106 | ) 107 | 108 | 109 | results = [e.copy() if type(e) == np.ndarray else e for e in src.policies_and_actions] 110 | 111 | bact = 4 112 | fig, ax = plt.subplots() 113 | ax.set_xlim(-1, 11) 114 | ax.set_ylim(-20, bact + 2) 115 | ax.set_xlabel("s") 116 | title_control = f"DP Control (PI or VI) - Iteration 0" 117 | title_prediction = f"DP Prediction of Q (IPE) - Iteration 0" 118 | 119 | # actions, = ax.plot(S, results[1] + bact, ".b", label = "Actions") 120 | actions_join, =ax.plot(S[results[0] == 0], [bact] * (len(S)-np.sum(results[0])), "r") 122 | qvalues_closer, = ax.plot(S, results[1][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 123 | qvalues_far, = ax.plot(S, results[1][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 124 | ax.legend() 125 | 126 | def update(n): 127 | global title_control, title_prediction 128 | if n>= len(results): 129 | ax.set_title("Value Iteration (ended)") 130 | return 131 | data = results[n] 132 | if type(data) == str: 133 | if "Control" in data: 134 | title_control = data 135 | ax.set_title(title_control + " | " + title_prediction) 136 | elif "Prediction" in data: 137 | title_prediction = data 138 | ax.set_title(title_control + " | " + title_prediction) 139 | elif type(data) == np.ndarray: 140 | if len(data.shape) == 1: 141 | actions_join.set_data(S[data == 0], [bact] * (len(S)-np.sum(data))) 142 | actions_leave.set_data(S[data == 1], [bact] * np.sum(data)) 143 | elif len(data.shape) == 2: 144 | qvalues_closer.set_ydata(data[:, 0]) 145 | qvalues_far.set_ydata(data[:, 1]) 146 | 147 | anim = FuncAnimation( fig = fig, 148 | func = update, 149 | repeat = True, 150 | frames = np.arange(2, len(results)), 151 | interval = 100) 152 | 153 | anim.save("figure/DP/value_iteration.gif", writer = "ffmpeg", fps = 30) 154 | plt.show() 155 | 
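# Both animations above are driven by the Bellman optimality backup. A minimal sketch of one
# Value Iteration sweep over the action values, assuming P has shape (n_states, n_actions,
# n_states) and R has shape (n_states, n_actions) -- the actual layout of
# transition_probability_ocean / reward_probability_ocean may differ:
#
#   import numpy as np
#   def value_iteration_sweep(P, R, Q, gamma=0.98):
#       V = Q.max(axis=1)                            # greedy state values max_a' Q(s', a')
#       return R + gamma * np.einsum("sat,t->sa", P, V)
#
# Policy Iteration alternates a full policy evaluation (IPE) with the same greedy improvement
# step, which is why the plotting code above tracks separate "Prediction" and "Control" titles.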
-------------------------------------------------------------------------------- /DP/plot_prediction_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 7 | from DP.dynamicProgramming import IterativePolicyEvaluation 8 | from src.policies import DiscretePolicyForDiscreteState 9 | 10 | 11 | algo_IPE = IterativePolicyEvaluation() 12 | 13 | n_iterations = 15 14 | S = np.arange(0,11) 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | ### ====================================================================================================================== ### 24 | policy_join_beach = DiscretePolicyForDiscreteState(probs = np.array([[1, 0] for _ in range(11)])) 25 | ### ====================================================================================================================== ### 26 | 27 | 28 | ### Plot the state values estimated through training 29 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_join_beach, 30 | transition_probability = transition_probability_ocean, 31 | reward_probability = reward_probability_ocean, 32 | n_iterations = n_iterations, 33 | maximal_error = 0.01, 34 | gamma=0.98, 35 | sweep_order="random",) 36 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 37 | 38 | 39 | fig, ax = plt.subplots() 40 | ax.set_xlim(-1, 11) 41 | ax.set_ylim(-n_iterations-2, 1) 42 | ax.set_xlabel("s") 43 | ax.set_ylabel("V(s)") 44 | ax.set_title(f"Policy join_beach : Iteration 0") 45 | 46 | 47 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 48 | line, = ax.plot(S, -S, "-r", label="True State Values (-s)") 49 | ax.legend() 50 | 51 | def update(n): 52 | data = VS[n] 53 | if type(data) == str: 54 | ax.set_title(f"Policy join_beach : {data}") 55 | elif type(data) == np.ndarray: 56 | points.set_ydata(VS[n]) 57 | 58 | anim = FuncAnimation( fig = fig, 59 | func = update, 60 | repeat = True, 61 | frames = np.arange(0, len(VS)), 62 | interval = 100) 63 | 64 | anim.save("figure/DP/v_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = 2) 65 | plt.show() 66 | 67 | 68 | 69 | 70 | 71 | ### Plot the action values estimated through training 72 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_join_beach, 73 | transition_probability = transition_probability_ocean, 74 | reward_probability = reward_probability_ocean, 75 | n_iterations = n_iterations, 76 | maximal_error = 0.01, 77 | gamma = 0.98) 78 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 79 | 80 | 81 | fig, ax = plt.subplots() 82 | ax.set_xlim(-1, 11) 83 | ax.set_ylim(-n_iterations-2, 1) 84 | ax.set_xlabel("s") 85 | ax.set_ylabel("Q(s, a)") 86 | ax.set_title(f"Policy join_beach : Iteration 0") 87 | 88 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 89 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 90 | ax.legend() 91 | 92 | def update(n): 93 | data = QSA[n] 94 | if type(data) == str: 95 | ax.set_title(f"Policy join_beach : {data}") 96 | elif type(data) == np.ndarray: 97 | points_get_closer.set_ydata(QSA[n][:, 0]) 98 | 
points_get_far.set_ydata(QSA[n][:, 1]) 99 | 100 | anim = FuncAnimation( fig = fig, 101 | func = update, 102 | repeat = True, 103 | frames = np.arange(0, len(QSA)), 104 | interval = 100) 105 | 106 | anim.save("figure/DP/q_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = 2) 107 | plt.show() 108 | 109 | 110 | 111 | 112 | ### ====================================================================================================================== ### 113 | policy_leave_beach = DiscretePolicyForDiscreteState(probs = np.array([[0, 1] for _ in range(11)])) 114 | ### ====================================================================================================================== ### 115 | 116 | ### Plot the state values estimated through training 117 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_leave_beach, 118 | transition_probability = transition_probability_ocean, 119 | reward_probability = reward_probability_ocean, 120 | n_iterations = n_iterations, 121 | maximal_error = 0.01, 122 | gamma=0.8) 123 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 124 | 125 | 126 | fig, ax = plt.subplots() 127 | ax.set_xlim(-1, 11) 128 | ax.set_ylim(-n_iterations-2, 1) 129 | ax.set_xlabel("s") 130 | ax.set_ylabel("V(s)") 131 | ax.set_title(f"Policy leave_beach : Iteration 0") 132 | 133 | 134 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 135 | ax.legend() 136 | 137 | def update(n): 138 | data = VS[n] 139 | if type(data) == str: 140 | ax.set_title(f"Policy leave_beach : {data}") 141 | elif type(data) == np.ndarray: 142 | points.set_ydata(VS[n]) 143 | 144 | anim = FuncAnimation( fig = fig, 145 | func = update, 146 | repeat = True, 147 | frames = np.arange(0, len(VS)), 148 | interval = 100) 149 | 150 | anim.save("figure/DP/v_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = 2) 151 | plt.show() 152 | 153 | 154 | 155 | 156 | 157 | 158 | ### Plot the action values estimated through training 159 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_leave_beach, 160 | transition_probability = transition_probability_ocean, 161 | reward_probability = reward_probability_ocean, 162 | n_iterations = n_iterations, 163 | maximal_error = 0.01, 164 | gamma = 0.8) 165 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 166 | 167 | fig, ax = plt.subplots() 168 | ax.set_xlim(-1, 11) 169 | ax.set_ylim(-n_iterations-2, 1) 170 | ax.set_xlabel("s") 171 | ax.set_ylabel("Q(s, a)") 172 | ax.set_title(f"Policy leave_beach : Iteration 0") 173 | 174 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 175 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 176 | ax.legend() 177 | 178 | def update(n): 179 | data = QSA[n] 180 | if type(data) == str: 181 | ax.set_title(f"Policy leave_beach : {data}") 182 | elif type(data) == np.ndarray: 183 | points_get_closer.set_ydata(QSA[n][:, 0]) 184 | points_get_far.set_ydata(QSA[n][:, 1]) 185 | 186 | anim = FuncAnimation( fig = fig, 187 | func = update, 188 | repeat = True, 189 | frames = np.arange(0, len(QSA)), 190 | interval = 100) 191 | 192 | anim.save("figure/DP/q_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = 2) 193 | plt.show() 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | ### 
====================================================================================================================== ### 202 | policy_swim_randomly = DiscretePolicyForDiscreteState(probs = np.array([[0.8, 0.2] for _ in range(11)])) 203 | ### ====================================================================================================================== ### 204 | 205 | ### Plot the state values estimated through training 206 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_swim_randomly, 207 | transition_probability = transition_probability_ocean, 208 | reward_probability = reward_probability_ocean, 209 | n_iterations = n_iterations, 210 | maximal_error = 0.01, 211 | gamma=0.98) 212 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 213 | 214 | 215 | fig, ax = plt.subplots() 216 | ax.set_xlim(-1, 11) 217 | ax.set_ylim(-n_iterations-2, 1) 218 | ax.set_xlabel("s") 219 | ax.set_ylabel("V(s)") 220 | ax.set_title(f"Policy swim_randomly : Iteration 0") 221 | 222 | 223 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 224 | ax.legend() 225 | 226 | def update(n): 227 | data = VS[n] 228 | if type(data) == str: 229 | ax.set_title(f"Policy swim_randomly : {data}") 230 | elif type(data) == np.ndarray: 231 | points.set_ydata(VS[n]) 232 | 233 | anim = FuncAnimation( fig = fig, 234 | func = update, 235 | repeat = True, 236 | frames = np.arange(0, len(VS)), 237 | interval = 100) 238 | 239 | anim.save("figure/DP/v_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = 2) 240 | plt.show() 241 | 242 | 243 | 244 | 245 | 246 | 247 | ### Plot the action values estimated through training 248 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_swim_randomly, 249 | transition_probability = transition_probability_ocean, 250 | reward_probability = reward_probability_ocean, 251 | n_iterations = n_iterations, 252 | maximal_error = 0.01, 253 | gamma = 0.98) 254 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 255 | 256 | fig, ax = plt.subplots() 257 | ax.set_xlim(-1, 11) 258 | ax.set_ylim(-n_iterations-2, 1) 259 | ax.set_xlabel("s") 260 | ax.set_ylabel("Q(s, a)") 261 | ax.set_title(f"Policy swim_randomly : Iteration 0") 262 | 263 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 264 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 265 | ax.legend() 266 | 267 | def update(n): 268 | data = QSA[n] 269 | if type(data) == str: 270 | ax.set_title(f"Policy swim_randomly : {data}") 271 | elif type(data) == np.ndarray: 272 | points_get_closer.set_ydata(QSA[n][:, 0]) 273 | points_get_far.set_ydata(QSA[n][:, 1]) 274 | 275 | anim = FuncAnimation( fig = fig, 276 | func = update, 277 | repeat = True, 278 | frames = np.arange(0, len(QSA)), 279 | interval = 100) 280 | 281 | anim.save("figure/DP/q_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = 2) 282 | plt.show() -------------------------------------------------------------------------------- /MC/plot_prediction_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import OceanEnv 7 | from MC.monteCarlo import MonteCarlo 8 | 
from src.policies import DiscretePolicyForDiscreteState 9 | 10 | 11 | algo_MC = MonteCarlo() 12 | 13 | n_episodes = 10 14 | S = np.arange(0,11) 15 | y_low_lim = -20 16 | fps = 30 17 | 18 | 19 | 20 | 21 | 22 | 23 | ### ====================================================================================================================== ### 24 | policy_join_beach = DiscretePolicyForDiscreteState(probs = np.array([[1, 0] for _ in range(11)])) 25 | ### ====================================================================================================================== ### 26 | 27 | 28 | ### Plot the state values estimated through training 29 | estimated_state_values_during_training = algo_MC.find_state_values_yielding(policy = policy_join_beach, 30 | env = OceanEnv(), 31 | n_episodes = 3 * n_episodes, 32 | gamma=0.98, 33 | visit_method="first_visit", 34 | averaging_method="moving", 35 | alpha=0.1, 36 | timelimit=40, 37 | initial_state_values="random", 38 | typical_value = -5, 39 | exploring_starts=False, 40 | is_state_done=lambda state: state == 0, 41 | ) 42 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 43 | 44 | 45 | fig, ax = plt.subplots() 46 | ax.set_xlim(-1, 11) 47 | ax.set_ylim(-13, 1) 48 | ax.set_xlabel("s") 49 | ax.set_ylabel("V(s)") 50 | 51 | 52 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 53 | line, = ax.plot(S, -S, "-r", label="True State Values (-s)") 54 | ax.legend() 55 | 56 | def update(n): 57 | data = VS[n] 58 | if type(data) == str: 59 | ax.set_title(f"Policy join_beach : {data}") 60 | elif type(data) == np.ndarray: 61 | points.set_ydata(VS[n]) 62 | 63 | anim = FuncAnimation( fig = fig, 64 | func = update, 65 | repeat = True, 66 | frames = np.arange(0, len(VS)), 67 | interval = 30) 68 | 69 | anim.save("figure/MC/v_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = fps) 70 | plt.show() 71 | 72 | 73 | 74 | 75 | 76 | ### Plot the action values estimated through training 77 | estimated_action_values_during_training = algo_MC.find_action_values_yielding( policy = policy_join_beach, 78 | env = OceanEnv(), 79 | n_episodes = 3 * n_episodes, 80 | gamma=0.98, 81 | visit_method="first_visit", 82 | averaging_method="moving", 83 | alpha=0.1, 84 | timelimit=40, 85 | initial_action_values="random", 86 | typical_value = -5, 87 | exploring_starts=False, 88 | is_state_done=lambda state: state == 0, 89 | ) 90 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 91 | 92 | 93 | fig, ax = plt.subplots() 94 | ax.set_xlim(-1, 11) 95 | ax.set_ylim(-13, 1) 96 | ax.set_xlabel("s") 97 | ax.set_ylabel("Q(s, a)") 98 | 99 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 100 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 101 | ax.legend() 102 | 103 | def update(n): 104 | data = QSA[n] 105 | if type(data) == str: 106 | ax.set_title(f"Policy join_beach : {data}") 107 | elif type(data) == np.ndarray: 108 | points_get_closer.set_ydata(QSA[n][:, 0]) 109 | points_get_far.set_ydata(QSA[n][:, 1]) 110 | 111 | 112 | anim = FuncAnimation( fig = fig, 113 | func = update, 114 | repeat = True, 115 | frames = np.arange(0, len(QSA)), 116 | interval = 30) 117 | 118 | anim.save("figure/MC/q_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = fps) 119 | plt.show() 120 | 121 | 122 | 123 | 124 | ### 
====================================================================================================================== ### 125 | policy_leave_beach = DiscretePolicyForDiscreteState(probs = np.array([[0, 1] for _ in range(11)])) 126 | ### ====================================================================================================================== ### 127 | 128 | ### Plot the state values estimated through training 129 | estimated_state_values_during_training = algo_MC.find_state_values_yielding(policy = policy_leave_beach, 130 | env = OceanEnv(), 131 | n_episodes = 2 * n_episodes, 132 | gamma=0.8, 133 | visit_method="first_visit", 134 | averaging_method="moving", 135 | alpha=0.1, 136 | timelimit=40, 137 | initial_state_values="random", 138 | typical_value = -5, 139 | exploring_starts=False, 140 | is_state_done=lambda state: state == 0, 141 | ) 142 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 143 | 144 | 145 | fig, ax = plt.subplots() 146 | ax.set_xlim(-1, 11) 147 | ax.set_ylim(-13, 1) 148 | ax.set_xlabel("s") 149 | ax.set_ylabel("V(s)") 150 | 151 | 152 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 153 | ax.legend() 154 | 155 | def update(n): 156 | data = VS[n] 157 | if type(data) == str: 158 | ax.set_title(f"Policy leave_beach : {data}") 159 | elif type(data) == np.ndarray: 160 | points.set_ydata(VS[n]) 161 | 162 | anim = FuncAnimation( fig = fig, 163 | func = update, 164 | repeat = True, 165 | frames = np.arange(0, len(VS)), 166 | interval = 30) 167 | 168 | anim.save("figure/MC/v_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = fps) 169 | plt.show() 170 | 171 | 172 | 173 | 174 | 175 | 176 | ### Plot the action values estimated through training 177 | estimated_action_values_during_training = algo_MC.find_action_values_yielding( policy = policy_leave_beach, 178 | env = OceanEnv(), 179 | n_episodes = 2 * n_episodes, 180 | gamma=0.8, 181 | visit_method="first_visit", 182 | averaging_method="moving", 183 | alpha=0.1, 184 | timelimit=40, 185 | initial_action_values="random", 186 | typical_value = -5, 187 | exploring_starts=False, 188 | is_state_done=lambda state: state == 0, 189 | ) 190 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 191 | 192 | 193 | fig, ax = plt.subplots() 194 | ax.set_xlim(-1, 11) 195 | ax.set_ylim(-13, 1) 196 | ax.set_xlabel("s") 197 | ax.set_ylabel("Q(s, a)") 198 | ax.set_title(f"Policy leave_beach : Iteration 0/{n_episodes}") 199 | 200 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 201 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 202 | ax.legend() 203 | 204 | def update(n): 205 | data = QSA[n] 206 | if type(data) == str: 207 | ax.set_title(f"Policy leave_beach : {data}") 208 | elif type(data) == np.ndarray: 209 | points_get_closer.set_ydata(QSA[n][:, 0]) 210 | points_get_far.set_ydata(QSA[n][:, 1]) 211 | 212 | 213 | anim = FuncAnimation( fig = fig, 214 | func = update, 215 | repeat = True, 216 | frames = np.arange(0, len(QSA)), 217 | interval = 100) 218 | 219 | anim.save("figure/MC/q_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = fps) 220 | plt.show() 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | ### ====================================================================================================================== ### 229 | policy_swim_randomly = DiscretePolicyForDiscreteState(probs = 
np.array([[0.8, 0.2] for _ in range(11)])) 230 | ### ====================================================================================================================== ### 231 | 232 | ### Plot the state values estimated through training 233 | estimated_state_values_during_training = algo_MC.find_state_values_yielding(policy = policy_swim_randomly, 234 | env = OceanEnv(), 235 | n_episodes = 3 * n_episodes, 236 | gamma=0.98, 237 | visit_method="first_visit", 238 | averaging_method="moving", 239 | alpha=0.1, 240 | timelimit=40, 241 | initial_state_values="random", 242 | typical_value = -5, 243 | exploring_starts=False, 244 | is_state_done=lambda state: state == 0, 245 | ) 246 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 247 | 248 | 249 | fig, ax = plt.subplots() 250 | ax.set_xlim(-1, 11) 251 | ax.set_ylim(y_low_lim, 1) 252 | ax.set_xlabel("s") 253 | ax.set_ylabel("V(s)") 254 | 255 | 256 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 257 | ax.legend() 258 | 259 | def update(n): 260 | data = VS[n] 261 | if type(data) == str: 262 | ax.set_title(f"Policy swim_randomly : {data}") 263 | elif type(data) == np.ndarray: 264 | points.set_ydata(VS[n]) 265 | 266 | anim = FuncAnimation( fig = fig, 267 | func = update, 268 | repeat = True, 269 | frames = np.arange(0, len(VS)), 270 | interval = 30) 271 | 272 | anim.save("figure/MC/v_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = fps) 273 | plt.show() 274 | 275 | 276 | 277 | 278 | 279 | 280 | ### Plot the action values estimated through training 281 | estimated_action_values_during_training = algo_MC.find_action_values_yielding( policy = policy_swim_randomly, 282 | env = OceanEnv(), 283 | n_episodes = 4 * n_episodes, 284 | gamma=0.98, 285 | visit_method="first_visit", 286 | averaging_method="moving", 287 | alpha=0.1, 288 | timelimit=40, 289 | initial_action_values="random", 290 | typical_value = -5, 291 | exploring_starts=False, 292 | is_state_done=lambda state: state == 0, 293 | ) 294 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 295 | 296 | 297 | fig, ax = plt.subplots() 298 | ax.set_xlim(-1, 11) 299 | ax.set_ylim(y_low_lim, 1) 300 | ax.set_xlabel("s") 301 | ax.set_ylabel("Q(s, a)") 302 | ax.set_title(f"Policy swim_randomly : Iteration 0/{n_episodes}") 303 | 304 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 305 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 306 | ax.legend() 307 | 308 | def update(n): 309 | data = QSA[n] 310 | if type(data) == str: 311 | ax.set_title(f"Policy swim_randomly : {data}") 312 | elif type(data) == np.ndarray: 313 | points_get_closer.set_ydata(QSA[n][:, 0]) 314 | points_get_far.set_ydata(QSA[n][:, 1]) 315 | 316 | anim = FuncAnimation( fig = fig, 317 | func = update, 318 | repeat = True, 319 | frames = np.arange(0, len(QSA)), 320 | interval = 100) 321 | 322 | anim.save("figure/MC/q_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = fps) 323 | plt.show() -------------------------------------------------------------------------------- /TD/plot_prediction_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import OceanEnv 7 | from TD.TDLearning 
import TD, SARSA 8 | from src.policies import DiscretePolicyForDiscreteState 9 | 10 | 11 | algo_TD = TD() 12 | algo_SARSA = SARSA() 13 | 14 | n_episodes = 30 15 | S = np.arange(0,11) 16 | y_low_lim = -20 17 | fps = 30 18 | 19 | 20 | 21 | 22 | 23 | 24 | ### ====================================================================================================================== ### 25 | policy_join_beach = DiscretePolicyForDiscreteState(probs = np.array([[1, 0] for _ in range(11)])) 26 | ### ====================================================================================================================== ### 27 | 28 | 29 | ### Plot the state values estimated through training 30 | estimated_state_values_during_training = algo_TD.find_state_values_yielding(policy = policy_join_beach, 31 | env = OceanEnv(), 32 | n_episodes = n_episodes, 33 | n_steps = float("inf"), 34 | gamma=0.99, 35 | alpha=0.5, 36 | timelimit=40, 37 | initial_state_values="random", 38 | typical_value = -5, 39 | exploring_starts=False, 40 | is_state_done=lambda state: state == 0, 41 | 42 | yield_frequency="step", 43 | ) 44 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 45 | 46 | 47 | fig, ax = plt.subplots() 48 | ax.set_xlim(-1, 11) 49 | ax.set_ylim(-13, 1) 50 | ax.set_xlabel("s") 51 | ax.set_ylabel("V(s)") 52 | 53 | 54 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 55 | line, = ax.plot(S, -S, "-r", label="True State Values (-s)") 56 | ax.legend() 57 | 58 | def update(n): 59 | data = VS[n] 60 | if type(data) == str: 61 | ax.set_title(f"Policy join_beach : {data}") 62 | elif type(data) == np.ndarray: 63 | points.set_ydata(VS[n]) 64 | 65 | anim = FuncAnimation( fig = fig, 66 | func = update, 67 | repeat = True, 68 | frames = np.arange(0, len(VS)), 69 | interval = 30) 70 | 71 | anim.save("figure/TD/v_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = fps) 72 | plt.show() 73 | 74 | 75 | 76 | 77 | ### Plot the action values estimated through training 78 | estimated_action_values_during_training = algo_SARSA.find_action_values_yielding( policy = policy_join_beach, 79 | env = OceanEnv(), 80 | n_episodes = n_episodes, 81 | n_steps = float("inf"), 82 | gamma=0.99, 83 | alpha=0.5, 84 | timelimit=40, 85 | initial_action_values="random", 86 | typical_value = -5, 87 | exploring_starts=False, 88 | is_state_done=lambda state: state == 0, 89 | 90 | yield_frequency="step", 91 | ) 92 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 93 | 94 | 95 | fig, ax = plt.subplots() 96 | ax.set_xlim(-1, 11) 97 | ax.set_ylim(-13, 1) 98 | ax.set_xlabel("s") 99 | ax.set_ylabel("Q(s, a)") 100 | 101 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 102 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 103 | ax.legend() 104 | 105 | def update(n): 106 | data = QSA[n] 107 | if type(data) == str: 108 | ax.set_title(f"Policy join_beach : {data}") 109 | elif type(data) == np.ndarray: 110 | points_get_closer.set_ydata(QSA[n][:, 0]) 111 | points_get_far.set_ydata(QSA[n][:, 1]) 112 | 113 | 114 | anim = FuncAnimation( fig = fig, 115 | func = update, 116 | repeat = True, 117 | frames = np.arange(0, len(QSA)), 118 | interval = 30) 119 | 120 | anim.save("figure/TD/q_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = fps) 121 | plt.show() 122 | 123 | 124 | 125 | 126 | ### 
====================================================================================================================== ### 127 | policy_leave_beach = DiscretePolicyForDiscreteState(probs = np.array([[0, 1] for _ in range(11)])) 128 | ### ====================================================================================================================== ### 129 | 130 | ### Plot the state values estimated through training 131 | estimated_state_values_during_training = algo_TD.find_state_values_yielding(policy = policy_leave_beach, 132 | env = OceanEnv(), 133 | n_episodes = 5, 134 | n_steps = float("inf"), 135 | gamma=0.8, 136 | alpha=0.5, 137 | timelimit=40, 138 | initial_state_values="random", 139 | typical_value = -5, 140 | exploring_starts=False, 141 | is_state_done=lambda state: state == 0, 142 | 143 | yield_frequency="step", 144 | ) 145 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 146 | 147 | 148 | fig, ax = plt.subplots() 149 | ax.set_xlim(-1, 11) 150 | ax.set_ylim(-13, 1) 151 | ax.set_xlabel("s") 152 | ax.set_ylabel("V(s)") 153 | 154 | 155 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 156 | ax.legend() 157 | 158 | def update(n): 159 | data = VS[n] 160 | if type(data) == str: 161 | ax.set_title(f"Policy leave_beach : {data}") 162 | elif type(data) == np.ndarray: 163 | points.set_ydata(VS[n]) 164 | 165 | anim = FuncAnimation( fig = fig, 166 | func = update, 167 | repeat = True, 168 | frames = np.arange(0, len(VS)), 169 | interval = 30) 170 | 171 | anim.save("figure/TD/v_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = fps) 172 | plt.show() 173 | 174 | 175 | 176 | 177 | 178 | 179 | ### Plot the action values estimated through training 180 | estimated_action_values_during_training = algo_SARSA.find_action_values_yielding( policy = policy_leave_beach, 181 | env = OceanEnv(), 182 | n_episodes = 5, 183 | n_steps = float("inf"), 184 | gamma=0.8, 185 | alpha=0.5, 186 | timelimit=40, 187 | initial_action_values="random", 188 | typical_value = -5, 189 | exploring_starts=False, 190 | is_state_done=lambda state: state == 0, 191 | 192 | yield_frequency="step", 193 | ) 194 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 195 | 196 | 197 | fig, ax = plt.subplots() 198 | ax.set_xlim(-1, 11) 199 | ax.set_ylim(-13, 1) 200 | ax.set_xlabel("s") 201 | ax.set_ylabel("Q(s, a)") 202 | ax.set_title(f"Policy leave_beach : Iteration 0/{n_episodes}") 203 | 204 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 205 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 206 | ax.legend() 207 | 208 | def update(n): 209 | data = QSA[n] 210 | if type(data) == str: 211 | ax.set_title(f"Policy leave_beach : {data}") 212 | elif type(data) == np.ndarray: 213 | points_get_closer.set_ydata(QSA[n][:, 0]) 214 | points_get_far.set_ydata(QSA[n][:, 1]) 215 | 216 | 217 | anim = FuncAnimation( fig = fig, 218 | func = update, 219 | repeat = True, 220 | frames = np.arange(0, len(QSA)), 221 | interval = 100) 222 | 223 | anim.save("figure/TD/q_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = fps) 224 | plt.show() 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | ### ====================================================================================================================== ### 233 | policy_swim_randomly = DiscretePolicyForDiscreteState(probs = np.array([[0.8, 0.2] for _ 
in range(11)])) 234 | ### ====================================================================================================================== ### 235 | 236 | ### Plot the state values estimated through training 237 | estimated_state_values_during_training = algo_TD.find_state_values_yielding(policy = policy_swim_randomly, 238 | env = OceanEnv(), 239 | n_episodes = n_episodes, 240 | n_steps = float("inf"), 241 | gamma=0.99, 242 | alpha=0.5, 243 | timelimit=40, 244 | initial_state_values="random", 245 | typical_value = -5, 246 | exploring_starts=False, 247 | is_state_done=lambda state: state == 0, 248 | 249 | yield_frequency="step", 250 | ) 251 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 252 | 253 | 254 | fig, ax = plt.subplots() 255 | ax.set_xlim(-1, 11) 256 | ax.set_ylim(y_low_lim, 1) 257 | ax.set_xlabel("s") 258 | ax.set_ylabel("V(s)") 259 | 260 | 261 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 262 | ax.legend() 263 | 264 | def update(n): 265 | data = VS[n] 266 | if type(data) == str: 267 | ax.set_title(f"Policy swim_randomly : {data}") 268 | elif type(data) == np.ndarray: 269 | points.set_ydata(VS[n]) 270 | 271 | anim = FuncAnimation( fig = fig, 272 | func = update, 273 | repeat = True, 274 | frames = np.arange(0, len(VS)), 275 | interval = 30) 276 | 277 | anim.save("figure/TD/v_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = fps) 278 | plt.show() 279 | 280 | 281 | 282 | 283 | 284 | 285 | ### Plot the action values estimated through training 286 | estimated_action_values_during_training = algo_SARSA.find_action_values_yielding( policy = policy_swim_randomly, 287 | env = OceanEnv(), 288 | n_episodes = n_episodes, 289 | n_steps = float("inf"), 290 | gamma=0.99, 291 | alpha=0.5, 292 | timelimit=40, 293 | initial_action_values="random", 294 | typical_value = -5, 295 | exploring_starts=False, 296 | is_state_done=lambda state: state == 0, 297 | 298 | yield_frequency="step", 299 | ) 300 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 301 | 302 | 303 | fig, ax = plt.subplots() 304 | ax.set_xlim(-1, 11) 305 | ax.set_ylim(y_low_lim, 1) 306 | ax.set_xlabel("s") 307 | ax.set_ylabel("Q(s, a)") 308 | ax.set_title(f"Policy swim_randomly : Iteration 0/{n_episodes}") 309 | 310 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 311 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 312 | ax.legend() 313 | 314 | def update(n): 315 | data = QSA[n] 316 | if type(data) == str: 317 | ax.set_title(f"Policy swim_randomly : {data}") 318 | elif type(data) == np.ndarray: 319 | points_get_closer.set_ydata(QSA[n][:, 0]) 320 | points_get_far.set_ydata(QSA[n][:, 1]) 321 | 322 | anim = FuncAnimation( fig = fig, 323 | func = update, 324 | repeat = True, 325 | frames = np.arange(0, len(QSA)), 326 | interval = 100) 327 | 328 | anim.save("figure/TD/q_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = fps) 329 | plt.show() -------------------------------------------------------------------------------- /TD/TDLearning.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Iterator, Tuple, Union 2 | import numpy as np 3 | import gym 4 | 5 | from src.policies import * 6 | from src.utils import * 7 | 8 | class TD: 9 | 10 | def find_state_values(self, policy : DiscretePolicyForDiscreteState, 11 | 
env : gym.Env, 12 | n_episodes : int = float("inf"), 13 | n_steps : int = float("inf"), 14 | gamma : float = 0.99, 15 | alpha : float = 0.1, 16 | timelimit : int = float("inf"), 17 | initial_state_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 18 | typical_value : float = 1, 19 | exploring_starts : bool = False, 20 | is_state_done : Callable = None, 21 | verbose : int = 1, 22 | ) -> np.ndarray: 23 | """This method performs TD(0) for state values, an online on-policy TD Learning algorithm aiming to estimates the state value. 24 | The algorithm stop after a certain number of episodes or steps done. 25 | 26 | policy : the policy to evaluate 27 | env : the environment to evaluate the policy on 28 | n_episodes : the maximal number of episodes of interaction with the env to perform the algorithm 29 | n_steps : the maximal number of steps of interaction with the env to perform the algorithm 30 | gamma : the discount factor 31 | alpha : the learning rate 32 | timelimit : the number of maximal steps in an episode. After that the episode will be considered done. Use for non terminal env. 33 | initial_state_values : the initial values of the state values. Can be "random", "zeros", "optimistic" or a numpy array. 34 | typical_value : the typical value of the state values. Used to initialize the state values if initial_state_values is "random". 35 | exploring_starts : if True, the algorithm will start at a random-non terminal state. Use IF accessible env. Use for create minimum exploration in the case of deterministic src.policies. 36 | is_state_done : a function returning whether a state is terminal. Used if exploring_starts is True for no initialization in the terminal states 37 | verbose : the verbosity level. 0 for no output, 1 for output. 38 | """ 39 | 40 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 41 | 42 | if verbose >= 1 : 43 | print(pretty_announcer(f"Start algorithm TD(0) for V.\nExploring starts : {exploring_starts}\nFor {n_episodes} episodes or {n_steps} steps.")) 44 | 45 | # Initialize the state values 46 | state_values = initialize_values( shape = (policy.n_states,), 47 | initial_values = initial_state_values, 48 | typical_value = typical_value) 49 | num_episode = 0 50 | num_total_step = 0 51 | 52 | while num_episode < n_episodes and num_total_step < n_steps: 53 | if verbose >= 1 : print(f"TD(0) Prediction of V - Episode {num_episode}/{n_episodes} - Step {num_total_step}/{n_steps}") 54 | # Initialize the state 55 | if exploring_starts: 56 | state_temp = env.reset() 57 | if not is_state_done(state_temp): 58 | state = state_temp 59 | env.state = state 60 | else: 61 | state = env.reset() 62 | else: 63 | state = env.reset() 64 | 65 | # Loop through the episode 66 | t = 0 67 | done = False 68 | while not done and t < timelimit and num_total_step < n_steps: 69 | # Take action, observe the next state and reward 70 | action = np.random.choice(policy.n_actions, p=policy.probs[state]) 71 | next_state, reward, done, _ = env.step(action) 72 | # Update the state values online 73 | state_values[state] += alpha * (reward + gamma * state_values[next_state] * (1-done) - state_values[state]) 74 | # timelimit : we artificially set the episode as done if the timelimit is reached 75 | if t >= timelimit: done = True 76 | # If done, we additonally learn V(s_next) to be 0. 
77 | if done: 78 | state_values[next_state] += alpha * (0 - state_values[next_state]) 79 | 80 | state = next_state 81 | t += 1 82 | num_total_step += 1 83 | 84 | num_episode += 1 85 | 86 | if verbose >= 1: 87 | print(f"TD(0) Prediction of V finished after {num_episode} episodes and {num_total_step} steps. State values found : {state_values}") 88 | 89 | return state_values 90 | 91 | 92 | 93 | def find_state_values_yielding(self,policy : DiscretePolicyForDiscreteState, 94 | env : gym.Env, 95 | n_episodes : int = float("inf"), 96 | n_steps : int = float("inf"), 97 | gamma : float = 0.99, 98 | alpha : float = 0.1, 99 | timelimit : int = float("inf"), 100 | initial_state_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 101 | typical_value : float = 1, 102 | exploring_starts : bool = False, 103 | is_state_done : Callable = None, 104 | yield_frequency : str = "step", # "iteration", "episode", "step" 105 | **kwargs, 106 | ) -> Iterator: 107 | """ 108 | Same as find_state_values, but yields the state values at each step. 109 | 110 | yield_frequency : "step" or "episode" or "iteration", the frequency at which the state values are yielded. 111 | """ 112 | 113 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 114 | assert yield_frequency in ["step", "episode", "iteration"], "yield_frequency must be 'step', 'episode' or 'iteration'" 115 | 116 | # Initialize the state values 117 | state_values = initialize_values( shape = (policy.n_states,), 118 | initial_values = initial_state_values, 119 | typical_value = typical_value) 120 | if yield_frequency != "iterations" : yield state_values 121 | num_episode = 0 122 | num_total_step = 0 123 | 124 | while num_episode < n_episodes and num_total_step < n_steps: 125 | 126 | if exploring_starts: 127 | state_temp = env.reset() 128 | if not is_state_done(state_temp): 129 | state = state_temp 130 | env.state = state 131 | else: 132 | state = env.reset() 133 | else: 134 | state = env.reset() 135 | 136 | # Loop through the episode 137 | t = 0 138 | done = False 139 | while not done and t < timelimit and num_total_step < n_steps: 140 | yield f"TD(0) Prediction of V - Episode {num_episode}/{n_episodes} - Step {num_total_step}/{n_steps}" 141 | # Take action, observe the next state and reward 142 | action = np.random.choice(policy.n_actions, p=policy.probs[state]) 143 | next_state, reward, done, _ = env.step(action) 144 | # Update the state values online 145 | state_values[state] += alpha * (reward + gamma * state_values[next_state] * (1-done) - state_values[state]) 146 | if yield_frequency == "step": yield state_values 147 | # timelimit : we artificially set the episode as done if the timelimit is reached 148 | if t >= timelimit: done = True 149 | # If done, we additonally learn V(s_next) to be 0. 
150 | if done: 151 | state_values[next_state] += alpha * (0 - state_values[next_state]) 152 | 153 | state = next_state 154 | t += 1 155 | num_total_step += 1 156 | 157 | if yield_frequency == "episode": yield state_values 158 | num_episode += 1 159 | if yield_frequency == "iteration": yield state_values 160 | 161 | 162 | class SARSA: 163 | 164 | def find_action_values(self,policy : DiscretePolicyForDiscreteState, 165 | env : gym.Env, 166 | n_episodes : int = float("inf"), 167 | n_steps : int = float("inf"), 168 | gamma : float = 0.99, 169 | alpha : float = 0.1, 170 | timelimit : int = float("inf"), 171 | initial_action_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 172 | typical_value : float = 1, 173 | exploring_starts : bool = False, 174 | is_state_done : Callable = None, 175 | verbose : int = 1, 176 | ) -> np.ndarray: 177 | """This method performs SARSA for action values, an online on-policy TD Learning algorithm aiming to estimates the action value. 178 | The algorithm stop after a certain number of episodes or steps done. 179 | 180 | policy : the policy to evaluate 181 | env : the environment to evaluate the policy on 182 | n_episodes : the maximal number of episodes of interaction with the env to perform the algorithm 183 | n_steps : the maximal number of steps of interaction with the env to perform the algorithm 184 | gamma : the discount factor 185 | alpha : the learning rate 186 | timelimit : the number of maximal steps in an episode. After that the episode will be considered done. Use for non terminal env. 187 | initial_action_values : the initial values of the action values. Can be "random", "zeros", "optimistic" or a numpy array. 188 | typical_value : the typical value of the action values. Used to initialize the action values if initial_action_values is "random". 189 | exploring_starts : if True, the algorithm will start at a random-non terminal qstate. Use IF accessible env. Use for create minimum exploration in the case of deterministic src.policies. 190 | is_state_done : a function returning whether a state is terminal. Used if exploring_starts is True for no initialization in the terminal states 191 | verbose : the verbosity level. 0 for no output, 1 for output. 192 | """ 193 | 194 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 195 | assert not exploring_starts or is_state_done is not None, "is_state_done must be specified if exploring_starts is True." 196 | 197 | if verbose >= 1 : 198 | print(pretty_announcer(f"Start algorithm SARSA for Q.\nExploring starts : {exploring_starts}\nFor {n_episodes} episodes or {n_steps} steps.")) 199 | 200 | # Initialize the state values 201 | action_values = initialize_values( shape = (policy.n_states, policy.n_actions), 202 | initial_values = initial_action_values, 203 | typical_value = typical_value) 204 | num_episode = 0 205 | num_total_step = 0 206 | state = env.reset() 207 | 208 | while num_episode < n_episodes and num_total_step < n_steps: 209 | if verbose >= 1 : print(f"SARSA Prediction of Q - Episode {num_episode}/{n_episodes} - Step {num_total_step}/{n_steps}") 210 | # Initialize the qstate 211 | state = env.reset() 212 | action = np.random.choice(policy.n_actions, p=policy.probs[state]) 213 | 214 | if exploring_starts: # If exploring starts, we try to choose randomly a qstate (s,a) with s non terminal. This unsure minimum exploration. 
215 | state_temp = np.random.choice(policy.n_states) 216 | if not is_state_done(state_temp): 217 | state = state_temp 218 | env.state = state 219 | action = np.random.choice(policy.n_actions) 220 | 221 | # Loop through the episode 222 | t = 0 223 | done = False 224 | while not done and t < timelimit and num_total_step < n_steps: 225 | # Take action, observe the next state and reward, take next action 226 | next_state, reward, done, _ = env.step(action) 227 | next_action = np.random.choice(policy.n_actions, p=policy.probs[next_state]) 228 | # Update the action values online 229 | action_values[state][action] += alpha * (reward + gamma * action_values[next_state][next_action] * (1-done) - action_values[state][action]) 230 | # timelimit : we artificially set the episode as done if the timelimit is reached 231 | if t >= timelimit: done = True 232 | # If done, we additonally learn Q(s_next, a_next) to be 0. 233 | if done: 234 | action_values[next_state][next_action] += alpha * (0 - action_values[next_state][next_action]) 235 | 236 | # Update the qstate 237 | state = next_state 238 | action = next_action 239 | t += 1 240 | num_total_step += 1 241 | 242 | num_episode += 1 243 | 244 | if verbose >= 1: 245 | print(f"SARSA Prediction of Q finished after {num_episode} episodes and {num_total_step} steps. Action values found : {action_values}") 246 | 247 | return action_values 248 | 249 | 250 | 251 | def find_action_values_yielding(self, policy : DiscretePolicyForDiscreteState, 252 | env : gym.Env, 253 | n_episodes : int = float("inf"), 254 | n_steps : int = float("inf"), 255 | gamma : float = 0.99, 256 | alpha : float = 0.1, 257 | timelimit : int = float("inf"), 258 | initial_action_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 259 | typical_value : float = 1, 260 | exploring_starts : bool = False, 261 | is_state_done : Callable = None, 262 | yield_frequency : str = "step", 263 | **kwargs, 264 | ) -> Iterator: 265 | """ 266 | Same as find_action_values, but yields the action values at each step. 267 | 268 | yield_frequency : "step" or "episode" or "iteration", the frequency at which the action values are yielded. 269 | """ 270 | 271 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 272 | assert not exploring_starts or is_state_done is not None, "is_state_done must be specified if exploring_starts is True." 
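        # Minimal usage sketch for the non-yielding method above (illustrative only;
        # `policy` and `env` are placeholders for a DiscretePolicyForDiscreteState and
        # a discrete gym environment, e.g. one of those defined under environnements/):
        #     sarsa = SARSA()
        #     Q = sarsa.find_action_values(policy, env, n_episodes=1000, gamma=0.98,
        #                                  alpha=0.1, timelimit=200, verbose=0)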
273 | 
274 |         # Initialize the action values
275 |         action_values = initialize_values( shape = (policy.n_states, policy.n_actions),
276 |                                            initial_values = initial_action_values,
277 |                                            typical_value = typical_value)
278 |         if yield_frequency != "iteration": yield action_values
279 |         num_episode = 0
280 |         num_total_step = 0
281 |         state = env.reset()
282 | 
283 |         while num_episode < n_episodes and num_total_step < n_steps:
284 |             # Initialize the qstate
285 |             state = env.reset()
286 |             action = np.random.choice(policy.n_actions, p=policy.probs[state])
287 |             # Loop through the episode
288 |             t = 0
289 |             done = False
290 |             while not done and t < timelimit and num_total_step < n_steps:
291 |                 yield f"SARSA Prediction of Q - Episode {num_episode}/{n_episodes} - Step {num_total_step}/{n_steps}"
292 |                 # Take action, observe the next state and reward, then sample the next action from the policy
293 |                 next_state, reward, done, _ = env.step(action)
294 |                 next_action = np.random.choice(policy.n_actions, p=policy.probs[next_state])
298 |                 # Update the action values online
299 |                 action_values[state][action] += alpha * (reward + gamma * action_values[next_state][next_action] * (1-done) - action_values[state][action])
300 |                 if yield_frequency == "step": yield action_values
301 |                 # timelimit : we artificially set the episode as done if the timelimit is reached
302 |                 if t >= timelimit: done = True
303 |                 # If done, we additionally learn Q(s_next, a_next) to be 0.
304 |                 if done:
305 |                     action_values[next_state][next_action] += alpha * (0 - action_values[next_state][next_action])
306 | 
307 |                 state = next_state
308 |                 action = next_action
309 |                 t += 1
310 |                 num_total_step += 1
311 | 
312 |             if yield_frequency == "episode": yield action_values
313 |             num_episode += 1
314 |         if yield_frequency == "iteration": yield action_values
315 | 
316 | 
317 |     def find_optimal_policy(self, env : gym.Env,
318 |                                 gamma : float = 1,
319 |                                 n_episodes : int = float("inf"),
320 |                                 n_steps : int = float("inf"),
321 |                                 exploration_method : str = "epsilon_greedy", # "epsilon_greedy", "UCB", "exploring_starts" or "greedy"
322 |                                 epsilon : Union[float, Scheduler] = 0.1,
323 |                                 alpha : float = 0.1,
324 |                                 timelimit : int = float("inf"),
325 |                                 initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array
326 |                                 typical_value : float = 1,
327 |                                 return_action_values : bool = False,
328 |                                 is_state_done : Callable = None,
329 |                                 verbose : int = 1,
330 |                                 ) -> Union[DiscretePolicyForDiscreteState, Tuple[DiscretePolicyForDiscreteState, np.ndarray]]:
331 |         """This method performs SARSA Control, an on-policy online Control algorithm.
332 |         It aims to find the optimal policy (among the explorative subset of policies induced by the exploration method).
333 | 
334 |         env : the environment to learn from
335 |         gamma : the discount factor
336 |         n_episodes : the number of episodes to learn from
337 |         exploration_method : the method to use for exploration ("epsilon_greedy", "UCB", "exploring_starts" or "greedy")
338 |         epsilon : the epsilon parameter for the epsilon-greedy method, can be a scalar or a Scheduler that returns a scalar given a timestep/episode
339 |         alpha : the learning rate used in the moving-average update
340 |         timelimit : the timelimit of an episode (useful for non-terminal environments)
341 |         initial_action_values : the initial values for the action values ("random", "zeros", "optimistic" or a numpy array)
342 |         typical_value : the typical value for the action values, used for scaling the "random" and "optimistic" value-initialization methods.
343 | return_action_values : if True, the method returns the action values along with the policy 344 | is_state_done : function return whether a state is terminal, used for the "exploring_starts" method 345 | verbose : the verbosity level 346 | """ 347 | 348 | if verbose >= 1 : 349 | print(pretty_announcer(f"Start algorithm SARSA Control.\nExploration method used : {exploration_method}\nFor {n_episodes} episodes or {n_steps} steps.")) 350 | 351 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 352 | assert exploration_method in ["epsilon_greedy", "UCB", "exploring_starts", "greedy"], "Unknown exploration method : {}".format(exploration_method) 353 | assert n_episodes > 0, "The number of episodes must be positive." 354 | 355 | # Initialize the action values 356 | n_states, n_actions = env.observation_space.n, env.action_space.n 357 | action_values = initialize_values( shape = (n_states, n_actions), 358 | initial_values = initial_action_values, 359 | typical_value = typical_value) 360 | 361 | # Loop through the episodes 362 | num_episode = 0 363 | num_total_step = 0 364 | state = env.reset() 365 | 366 | while num_episode < n_episodes: 367 | if verbose >= 1 : print(f"SARSA Control - Episode {num_episode}/{n_episodes}") 368 | # Initialize the qstate 369 | state = env.reset() 370 | if exploration_method == "greedy": 371 | action = np.argmax(action_values[state]) 372 | elif exploration_method == "epsilon_greedy": 373 | eps = epsilon if np.isscalar(epsilon) else epsilon(timestep=num_total_step, episode=num_episode) 374 | action = np.random.choice(n_actions) if np.random.random() < eps else np.argmax(action_values[state]) 375 | elif exploration_method == "UCB": 376 | raise NotImplementedError("UCB exploration method is not implemented yet.") 377 | elif exploration_method == "exploring_starts": 378 | assert is_state_done is not None, "is_state_done must be specified if exploring_starts is True." 
379 | state_temp = np.random.choice(n_states) 380 | if is_state_done(state_temp): 381 | action = np.argmax(action_values[state_temp]) 382 | else: 383 | action = np.random.choice(n_actions) 384 | state = state_temp 385 | env.state = state_temp 386 | else: 387 | raise NotImplementedError("Unknown exploration method : {}".format(exploration_method)) 388 | 389 | # Loop through the episode 390 | t=0 391 | done = False 392 | while not done and t < timelimit and num_total_step < n_steps: 393 | # Take action, observe the next state and reward, choose next action 394 | next_state, reward, done, _ = env.step(action) 395 | if exploration_method == "greedy" or exploration_method == "exploring_starts": 396 | next_action = np.argmax(action_values[next_state]) 397 | elif exploration_method == "epsilon_greedy": 398 | eps = epsilon if np.isscalar(epsilon) else epsilon(timestep=num_total_step, episode=num_episode) 399 | next_action = np.random.choice(n_actions) if np.random.random() < eps else np.argmax(action_values[next_state]) 400 | elif exploration_method == "UCB": 401 | raise NotImplementedError("UCB exploration method is not implemented yet.") 402 | else: 403 | raise NotImplementedError("Unknown exploration method : {}".format(exploration_method)) 404 | # Update the action values online 405 | action_values[state][action] += alpha * (reward + gamma * action_values[next_state][next_action] * (1-done) - action_values[state][action]) 406 | # timelimit : we artificially set the episode as done if the timelimit is reached 407 | if t >= timelimit: done = True 408 | # If done, we additonally learn Q(s_next, a_next) to be 0, since by conventon values of terminal states are 0 409 | if done: 410 | action_values[next_state][next_action] += alpha * (0 - action_values[next_state][next_action]) 411 | 412 | # Update the state and action 413 | state = next_state 414 | action = next_action 415 | t += 1 416 | num_total_step += 1 417 | 418 | num_episode += 1 419 | 420 | if verbose >= 1: 421 | print(f"SARSA Control finished after {num_episode} episodes and {num_total_step} steps. Action values found : {action_values}") 422 | 423 | probs = np.array([[int(action == np.argmax(action_values[state])) for action in range(n_actions)] for state in range(n_states)]) 424 | optimal_policy = DiscretePolicyForDiscreteState(probs) 425 | if return_action_values: 426 | return optimal_policy, action_values 427 | else: 428 | return optimal_policy 429 | 430 | 431 | 432 | def find_optimal_policy_yielding(self, env : gym.Env, 433 | gamma : float = 1, 434 | n_episodes : int = float("inf"), 435 | n_steps : int = float("inf"), 436 | exploration_method : str = "epsilon_greedy", # "epsilon_greedy" or "UCB" 437 | epsilon : Union[float, Scheduler] = 0.1, 438 | alpha : float = 0.1, 439 | timelimit : int = float("inf"), 440 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 441 | typical_value : float = 1, 442 | return_action_values : bool = False, 443 | is_state_done : Callable = None, 444 | yielding_frequency : str = "step", # "step" or "episode" 445 | **kwargs, 446 | ) -> Iterator: 447 | """Same as find_optimal_policy, but yields the action values along with the actions through the training 448 | 449 | yield_frequency : "step" or "episode", the frequency at which the state values are yielded. 450 | """ 451 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 
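        # Minimal usage sketch for SARSA Control (the non-yielding find_optimal_policy
        # above); illustrative only, `env` stands for any discrete gym environment,
        # e.g. one of those defined under environnements/ :
        #     sarsa = SARSA()
        #     policy, q_values = sarsa.find_optimal_policy(env, gamma=0.98, n_episodes=500,
        #                                                  exploration_method="epsilon_greedy",
        #                                                  epsilon=0.1, alpha=0.1, timelimit=200,
        #                                                  return_action_values=True, verbose=0)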
452 | assert exploration_method in ["epsilon_greedy", "UCB", "exploring_starts", "greedy"], "Unknown exploration method : {}".format(exploration_method) 453 | assert n_episodes > 0, "The number of episodes must be positive." 454 | 455 | # Initialize the action values 456 | n_states, n_actions = env.observation_space.n, env.action_space.n 457 | action_values = initialize_values( shape = (n_states, n_actions), 458 | initial_values = initial_action_values, 459 | typical_value = typical_value) 460 | greedy_actions = np.argmax(action_values, axis=1) 461 | yield greedy_actions 462 | yield action_values 463 | 464 | # Loop through the episodes 465 | num_episode = 0 466 | num_total_step = 0 467 | state = env.reset() 468 | 469 | while num_episode < n_episodes: 470 | yield f"SARSA Control - Episode {num_episode}/{n_episodes} - Step {num_total_step}/{n_steps}" 471 | # Initialize the qstate 472 | state = env.reset() 473 | if exploration_method == "greedy": 474 | action = np.argmax(action_values[state]) 475 | elif exploration_method == "epsilon_greedy": 476 | eps = epsilon if np.isscalar(epsilon) else epsilon(timestep=num_total_step, episode=num_episode) 477 | action = np.random.choice(n_actions) if np.random.random() < eps else np.argmax(action_values[state]) 478 | elif exploration_method == "UCB": 479 | raise NotImplementedError("UCB exploration method is not implemented yet.") 480 | elif exploration_method == "exploring_starts": 481 | assert is_state_done is not None, "is_state_done must be specified if exploring_starts is True." 482 | state_temp = np.random.choice(n_states) 483 | if is_state_done(state_temp): 484 | action = np.argmax(action_values[state_temp]) 485 | else: 486 | action = np.random.choice(n_actions) 487 | state = state_temp 488 | env.state = state_temp 489 | else: 490 | raise NotImplementedError("Unknown exploration method : {}".format(exploration_method)) 491 | 492 | # Loop through the episode 493 | t=0 494 | done = False 495 | while not done and t < timelimit and num_total_step < n_steps: 496 | # Take action, observe the next state and reward, choose next action 497 | next_state, reward, done, _ = env.step(action) 498 | if exploration_method == "greedy" or exploration_method == "exploring_starts": 499 | next_action = np.argmax(action_values[next_state]) 500 | elif exploration_method == "epsilon_greedy": 501 | eps = epsilon if np.isscalar(epsilon) else epsilon(timestep=num_total_step, episode=num_episode) 502 | next_action = np.random.choice(n_actions) if np.random.random() < eps else np.argmax(action_values[next_state]) 503 | elif exploration_method == "UCB": 504 | raise NotImplementedError("UCB exploration method is not implemented yet.") 505 | else: 506 | raise NotImplementedError("Unknown exploration method : {}".format(exploration_method)) 507 | # Update the action values online 508 | action_values[state][action] += alpha * (reward + gamma * action_values[next_state][next_action] * (1-done) - action_values[state][action]) 509 | # timelimit : we artificially set the episode as done if the timelimit is reached 510 | if t >= timelimit: done = True 511 | # If done, we additonally learn Q(s_next, a_next) to be 0, since by conventon values of terminal states are 0 512 | if done: 513 | action_values[next_state][next_action] += alpha * (0 - action_values[next_state][next_action]) 514 | 515 | # Update the state and action 516 | state = next_state 517 | action = next_action 518 | t += 1 519 | num_total_step += 1 520 | 521 | if yielding_frequency == "step": 522 | greedy_actions = 
np.argmax(action_values, axis=1) 523 | yield action_values 524 | yield greedy_actions 525 | 526 | greedy_actions = np.argmax(action_values, axis=1) 527 | yield action_values 528 | yield greedy_actions 529 | num_episode += 1 530 | 531 | -------------------------------------------------------------------------------- /DP/dynamicProgramming.py: -------------------------------------------------------------------------------- 1 | from time import sleep, time 2 | from typing import Iterator, Tuple, Union 3 | import numpy as np 4 | 5 | from src.policies import * 6 | from src.utils import * 7 | 8 | class IterativePolicyEvaluation: 9 | 10 | def find_state_values(self, policy : DiscretePolicyForDiscreteState, 11 | transition_probability : np.ndarray, 12 | reward_probability : np.ndarray, 13 | n_iterations : int = None, 14 | maximal_error : float = None, 15 | gamma : float = 1, 16 | sweep_order : str = "normal", # "normal" or "reverse" or "random" 17 | initial_state_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 18 | typical_value : float = 1, 19 | verbose = 1, 20 | **kwargs, 21 | ) -> np.ndarray: 22 | """This method perform the IterativePolicyEvaluation algorithm. It computes an estimation of the state values for a given policy, in a given model (transition_probability and reward_probability). 23 | The algorithm stop either after a given number of iterations or when the worst error (among the states) between two V(s) estimation consecutive is below a given threshold. 24 | 25 | transition_probability : a numpy array of shape (n_states, n_actions, n_states) representing the transition probability between states. 26 | reward_probability : a numpy array of shape (n_states, n_actions) representing the reward probability for each action in each state. 27 | n_iterations : the number of iterations to perform. 28 | maximal_error : the error between 2 consecutives state value below what the algorithm will stop, considering that it has converged. 29 | gamma : the discount factor. 30 | sweep_order : the order in which we will iterate over the states. "normal" or "reverse" or "random". This can have a significant impact on the convergence of the algorithm. 31 | initial_state_values : the initial values of the state values. Can be "random", "zeros", "optimistic" or a numpy array. 32 | typical_value : the typical value of the state values. Used to initialize the state values if initial_state_values is "random". 33 | verbose : the verbosity level, 0 for no output, 1 for an end output. 34 | """ 35 | 36 | assert n_iterations != None or maximal_error != None, "The stop condition is not well defined. Please specify either n_iterations or maximal_error." 37 | 38 | # Define the order in which we will iterate over the states 39 | n_states, n_actions = reward_probability.shape 40 | states_sweep = np.arange(n_states) 41 | if sweep_order == "reverse": 42 | states_sweep = np.flip(states_sweep) 43 | elif sweep_order == "random": 44 | np.random.shuffle(states_sweep) 45 | 46 | # Initialize the state values 47 | state_values = initialize_values( shape = (n_states,), 48 | initial_values=initial_state_values, 49 | typical_value=typical_value) 50 | n_iter = 0 51 | keep_iterating = True 52 | 53 | while keep_iterating: 54 | worst_error = 0 55 | # Iterate over the states, update state value in an in-place manner (using only one array). 
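            # The update below applies the Bellman expectation backup implemented in
            # compute_state_value :
            #     V(s) <- sum_a pi(a|s) * [ R(s, a) + gamma * sum_s' P(s'|s, a) * V(s') ]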
56 |             for state in states_sweep:
57 |                 value = state_values[state]
58 |                 state_values[state] = self.compute_state_value(state, policy, transition_probability, reward_probability, state_values, gamma)
59 |                 worst_error = max(worst_error, abs(value - state_values[state]))
60 |             # Stop algorithm if we reached the maximum number of iterations or if the error is below the threshold
61 |             n_iter += 1
62 |             if n_iterations != None and n_iter >= n_iterations:
63 |                 keep_iterating = False
64 |                 if verbose >= 1: print("The algorithm stopped after {} iterations. Stop condition : number of iterations reached.".format(n_iter))
65 |             elif maximal_error != None and worst_error <= maximal_error:
66 |                 keep_iterating = False
67 |                 if verbose >= 1: print("The algorithm stopped after {} iterations. Stop condition : worst error ({}) below the requested maximal error ({})".format(n_iter, worst_error, maximal_error))
68 | 
69 |         return state_values
70 | 
71 | 
72 | 
73 |     def compute_state_value(self, state : int,
74 |                                 policy : DiscretePolicyForDiscreteState,
75 |                                 transition_probability : np.ndarray,
76 |                                 reward_probability : np.ndarray,
77 |                                 state_values : np.ndarray,
78 |                                 gamma : float) -> float:
79 |         """This function computes the state value for a given state, policy, model (transition_probability and reward_probability) and current state values vector.
80 |         It applies the Bellman operator for state values, i.e. the right-hand side of the Bellman expectation equation for V.
81 |         """
82 |         n_states, n_actions = reward_probability.shape
83 |         value = 0
84 |         for action in range(n_actions):
85 |             value += policy.get_prob(state, action) * (reward_probability[state, action] +
86 |                                                        gamma * transition_probability[state, action, :].dot(state_values))
87 |         return value
88 | 
89 | 
90 | 
91 | 
92 |     def find_state_values_yielding(self, policy : DiscretePolicyForDiscreteState,
93 |                                         transition_probability : np.ndarray,
94 |                                         reward_probability : np.ndarray,
95 |                                         n_iterations : int = None,
96 |                                         maximal_error : float = None,
97 |                                         gamma : float = 1,
98 |                                         sweep_order : str = "normal", # "normal" or "reverse" or "random"
99 |                                         initial_state_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array
100 |                                         typical_value : float = 1,
101 |                                         yield_frequency : str = "step", # "step", "iteration" or "global_iteration"
102 |                                         **kwargs,
103 |                                         ) -> Iterator:
104 |         """This function is the same as find_state_values, but it yields the state values during training. Useful for observing the convergence of the algorithm.
105 | 
106 |         yield_frequency : "step", "iteration" or "global_iteration", the frequency at which the state values are yielded.
107 |         """
108 | 
109 |         assert n_iterations != None or maximal_error != None, "The stop condition is not well defined. Please specify either n_iterations or maximal_error."
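        # Minimal usage sketch for the prediction methods of this class (illustrative
        # only; P and R are placeholders for the model arrays described above, with
        # shapes (n_states, n_actions, n_states) and (n_states, n_actions)) :
        #     ipe = IterativePolicyEvaluation()
        #     V = ipe.find_state_values(policy, P, R, maximal_error=1e-4, gamma=0.98,
        #                               initial_state_values="zeros", verbose=1)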
110 | 111 | n_states, n_actions = reward_probability.shape 112 | states_sweep = np.arange(n_states) 113 | if sweep_order == "reverse": 114 | states_sweep = np.flip(states_sweep) 115 | elif sweep_order == "random": 116 | np.random.shuffle(states_sweep) 117 | 118 | state_values = initialize_values( shape = (n_states,), 119 | initial_values=initial_state_values, 120 | typical_value=typical_value) 121 | if yield_frequency != "global_iteration": yield state_values 122 | n_iter = 0 123 | keep_iterating = True 124 | 125 | while keep_iterating: 126 | worst_error = 0 127 | yield f"DP Prediction of V (IPE) - Iteration {n_iter} :" 128 | for state in states_sweep: 129 | value = state_values[state] 130 | state_values[state] = self.compute_state_value(state, policy, transition_probability, reward_probability, state_values, gamma) 131 | worst_error = max(worst_error, abs(value - state_values[state])) 132 | if yield_frequency == "step" : yield state_values 133 | n_iter += 1 134 | if n_iterations != None and n_iter >= n_iterations: 135 | keep_iterating = False 136 | elif maximal_error != None and worst_error <= maximal_error: 137 | keep_iterating = False 138 | if yield_frequency == "iteration": yield state_values 139 | if yield_frequency == "global_iteration" : yield state_values 140 | 141 | 142 | 143 | def find_action_values(self, policy : DiscretePolicyForDiscreteState, 144 | transition_probability : np.ndarray, 145 | reward_probability : np.ndarray, 146 | n_iterations : int = None, 147 | maximal_error : float = None, 148 | gamma : float = 1, 149 | sweep_order : str = "random", # "normal" or "reverse" or "random" 150 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 151 | typical_value : float = 1, 152 | verbose = 1, 153 | **kwargs, 154 | ) -> np.ndarray: 155 | 156 | """This method perform the IterativePolicyEvaluation algorithm. It computes an estimation of the action values for a given policy, in a given model (transition_probability and reward_probability). 157 | The algorithm stop either after a given number of iterations or when the worst error (among the states+actions) between two Q(s,a) estimation consecutive is below a given threshold. 158 | 159 | transition_probability : a numpy array of shape (n_states, n_actions, n_states) representing the transition probability between states. 160 | reward_probability : a numpy array of shape (n_states, n_actions) representing the reward probability for each action in each state. 161 | n_iterations : the number of iterations to perform. 162 | maximal_error : the error between 2 consecutives state value below what the algorithm will stop, considering that it has converged. 163 | gamma : the discount factor. 164 | sweep_order : the order in which we will iterate over the states. "normal" or "reverse" or "random". This can have a significant impact on the convergence of the algorithm. 165 | initial_action_values : the initial values of the action values. Can be "random", "zeros", "optimistic" or a numpy array. 166 | typical_value : the typical value of the action values. Used to initialize the action values if initial_action_values is "random". 167 | verbose : the verbosity level, 0 for no output, 1 for an end output. 168 | """ 169 | 170 | assert n_iterations != None or maximal_error != None, "The stop condition is not well defined. Please specify either n_iterations or maximal_error." 
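        # compute_action_value below applies the Bellman expectation backup for action
        # values under the evaluated policy pi :
        #     Q(s, a) <- R(s, a) + gamma * sum_s' P(s'|s, a) * sum_a' pi(a'|s') * Q(s', a')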
171 | 172 | # Define the order in which we will iterate over the states 173 | n_states, n_actions = reward_probability.shape 174 | states_sweep = np.arange(n_states) 175 | if sweep_order == "reverse": 176 | states_sweep = np.flip(states_sweep) 177 | elif sweep_order == "random": 178 | np.random.shuffle(states_sweep) 179 | 180 | # Initialize the action values 181 | action_values = initialize_values( shape = (n_states, n_actions), 182 | initial_values = initial_action_values, 183 | typical_value = typical_value) 184 | n_iter = 0 185 | keep_iterating = True 186 | 187 | while keep_iterating: 188 | worst_error = 0 189 | # Iterate over the states and actions, update actions values value in an in-place manner (using only one array). 190 | for state in states_sweep: 191 | for action in range(n_actions): 192 | value = action_values[state][action] 193 | action_values[state][action] = self.compute_action_value(state, action, policy, transition_probability, reward_probability, action_values, gamma) 194 | worst_error = max(worst_error, abs(value - action_values[state][action])) 195 | # Stop algorithm if we reached the maximum number of iterations or if the error is below the threshold 196 | n_iter += 1 197 | if n_iterations != None and n_iter >= n_iterations: 198 | keep_iterating = False 199 | if verbose >= 1: print("The algorithm stopped after {} iterations. Stop condition : number of iteration reached.".format(n_iter)) 200 | elif maximal_error != None and worst_error <= maximal_error: 201 | keep_iterating = False 202 | if verbose >= 1: print("The algorithm stopped after {} iterations. Stop condition : worst error ({}) inferior to the maximal error asked ({})".format(n_iter, worst_error, maximal_error)) 203 | 204 | return action_values 205 | 206 | 207 | def compute_action_value(self, state : int, 208 | action : int, 209 | policy : DiscretePolicyForDiscreteState, 210 | transition_probability : np.ndarray, 211 | reward_probability : np.ndarray, 212 | q_values : np.ndarray, 213 | gamma : float) -> float: 214 | """This function compute the action value for a given state, action, a given policy, a given model (transition_probability and reward_probability), and for the action values vector. 215 | It applies the Bellman Operator to action values (the Bellman Operator is the right term in the Dynamic Bellman Equation for action values). 216 | """ 217 | n_states, n_actions = reward_probability.shape 218 | value = reward_probability[state, action] 219 | for next_state in range(n_states): 220 | value += gamma * transition_probability[state, action, next_state] * policy.probs[next_state].dot(q_values[next_state]) 221 | return value 222 | 223 | 224 | def find_action_values_yielding(self, policy : DiscretePolicyForDiscreteState, 225 | transition_probability : np.ndarray, 226 | reward_probability : np.ndarray, 227 | n_iterations : int = None, 228 | maximal_error : float = None, 229 | gamma : float = 1, 230 | sweep_order : str = "random", # "normal" or "reverse" or "random" 231 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 232 | typical_value : float = 1, 233 | yield_frequency : str = "step", # "step" or "iteration" 234 | **kwargs, 235 | ) -> Iterator: 236 | 237 | """This function is the same as find_action_values, but it yields the action values at each iteration. Use for observe the convergence of the algorithm. 238 | 239 | yield_frequency : "step" or "iteration", the frequency at which the action values are yielded. 
240 | """ 241 | 242 | assert n_iterations != None or maximal_error != None, "The stop condition is not well defined. Please specify either n_iterations or maximal_error." 243 | 244 | n_states, n_actions = reward_probability.shape 245 | states_sweep = np.arange(n_states) 246 | if sweep_order == "reverse": 247 | states_sweep = np.flip(states_sweep) 248 | elif sweep_order == "random": 249 | np.random.shuffle(states_sweep) 250 | 251 | action_values = initialize_values( shape = (n_states, n_actions), 252 | initial_values = initial_action_values, 253 | typical_value = typical_value) 254 | if yield_frequency != "global_iteration": yield action_values 255 | n_iter = 0 256 | keep_iterating = True 257 | 258 | while keep_iterating: 259 | worst_error = 0 260 | yield f"DP Prediction of Q (IPE) - Iteration {n_iter} :" 261 | for state in states_sweep: 262 | for action in range(n_actions): 263 | value = action_values[state][action] 264 | action_values[state][action] = self.compute_action_value(state, action, policy, transition_probability, reward_probability, action_values, gamma) 265 | worst_error = max(worst_error, abs(value - action_values[state][action])) 266 | if yield_frequency == "step" : yield action_values 267 | n_iter += 1 268 | if n_iterations != None and n_iter >= n_iterations: 269 | keep_iterating = False 270 | elif maximal_error != None and worst_error <= maximal_error: 271 | keep_iterating = False 272 | if yield_frequency == "iteration": yield action_values 273 | if yield_frequency == "global_iteration" : yield action_values 274 | 275 | 276 | 277 | class PolicyIteration: 278 | 279 | def find_optimal_policy(self, transition_probability : np.ndarray, 280 | reward_probability : np.ndarray, 281 | n_iterations : int = float("inf"), 282 | IPE_n_iterations : int = None, 283 | IPE_maximal_error : float = None, 284 | gamma : float = 1, 285 | sweep_order : str = "normal", # "normal" or "reverse" or "random" 286 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 287 | typical_value : float = 1, 288 | return_action_values : bool = False, 289 | verbose : int = 1, 290 | stop_if_policy_stable = True, 291 | ) -> DiscretePolicyForDiscreteState : 292 | 293 | """This method performs the Policy Iteration algorithm. It computes an optimal policy for a given model (transition_probability and reward_probability). 294 | The algorithm stop either when the policy is stable (no change in the policy) or when the number of iterations is reached. 295 | 296 | transition_probability : a numpy array of shape (n_states, n_actions, n_states) representing the transition probability between states and actions. 297 | reward_probability : a numpy array of shape (n_states, n_actions) representing the reward expected for each state and action. 298 | n_iterations : the number of iterations for the policy iteration algorithm. 299 | IPE_n_iterations : the number of iterations for the IPE algorithm. 300 | IPE_maximal_error : the maximal error allowed for the IPE algorithm. 301 | gamma : the discount factor 302 | sweep_order : the order in which we will iterate over the states. "normal" or "reverse" or "random". This can have a significant impact on the convergence of the algorithm. 303 | initial_values : the initial values for the action values ("random", "zeros", "optimistic" or a numpy array) 304 | typical_value : the typical value for the action values, used for scaling the "random" and "optimistic" value-initialization methods. 
305 | return_action_values : if True, the action values are returned with the policy 306 | verbose : the verbosity level. 0 : no print, 1 : print when PI has finished. 307 | stop_if_policy_stable : if True, the algorithm stops when the policy is stable because it consider the policy has converged. 308 | """ 309 | assert n_iterations >= 1, "The number of iterations must be strictly positive." 310 | 311 | if IPE_maximal_error is None and IPE_n_iterations is None: 312 | IPE_maximal_error = 0.01 313 | 314 | n_states, n_actions = reward_probability.shape 315 | actions = np.random.choice(np.array([a for a in range(n_actions)]), size = n_states,) 316 | action_values = initialize_values( shape = (n_states, n_actions), 317 | initial_values = initial_action_values, 318 | typical_value = typical_value) 319 | algo_IPE = IterativePolicyEvaluation() 320 | 321 | n_iter = 0 322 | while n_iter < n_iterations: 323 | 324 | #Iterative Policy Evaluation 325 | probs = np.zeros((n_states, n_actions)) # convert deterministic actions to stochastic policy 326 | probs[np.arange(n_states), actions] = 1 327 | policy = DiscretePolicyForDiscreteState(probs) 328 | action_values = algo_IPE.find_action_values(policy, #Evaluate the policy 329 | transition_probability, 330 | reward_probability, 331 | n_iterations = IPE_n_iterations, #Convergence criteria for the IPE 332 | maximal_error = IPE_maximal_error, 333 | gamma = gamma, 334 | sweep_order=sweep_order, 335 | 336 | initial_action_values = action_values, #Initialize the IPE with the previous action values computed, increase convergence a bit 337 | verbose = 0, #Silence the IPE method 338 | ) 339 | #Policy improvement 340 | actions_old = actions.copy() 341 | for state in range(n_states): 342 | actions[state] = np.argmax(action_values[state]) 343 | 344 | n_iter += 1 345 | if stop_if_policy_stable and (actions == actions_old).all(): 346 | break 347 | 348 | if verbose >= 1: 349 | if n_iter < n_iterations: 350 | print("Policy Iteration stopped after {} iterations. Stop condition : policy is stable.".format(n_iter)) 351 | else: 352 | print("Policy Iteration stopped after {} iterations. Stop condition : maximal number of iterations reached.".format(n_iter)) 353 | 354 | if return_action_values: 355 | return policy, action_values 356 | else: 357 | return policy 358 | 359 | 360 | 361 | def find_optimal_policy_yielding(self, transition_probability : np.ndarray, 362 | reward_probability : np.ndarray, 363 | IPE_n_iterations : int = None, 364 | IPE_maximal_error : float = None, 365 | n_iterations : int = float("inf"), 366 | gamma : float = 1, 367 | sweep_order : str = "normal", # "normal" or "reverse" or "random" 368 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 369 | typical_value : float = 1, 370 | yield_frequency : str = "step", # "step", "iteration" or "global_iteration" 371 | stop_if_policy_stable = True, 372 | **kwargs, 373 | ) -> Iterator: 374 | 375 | """This function is the same as find_optimal_policy, but it yields the actions and action values at each iteration. Use for observe the convergence of the algorithm. 376 | 377 | yield_frequency : "step" or "iteration", the frequency at which the state values are yielded. 378 | """ 379 | assert n_iterations >= 1, "The number of iterations must be strictly positive." 
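        # Minimal usage sketch for the non-yielding find_optimal_policy above
        # (illustrative only; P and R are placeholders for the transition and reward
        # arrays described in the docstring) :
        #     pi_algo = PolicyIteration()
        #     policy, q_values = pi_algo.find_optimal_policy(P, R, gamma=0.98,
        #                                                    IPE_maximal_error=0.01,
        #                                                    return_action_values=True, verbose=0)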
380 | 381 | if IPE_maximal_error is None and IPE_n_iterations is None: 382 | IPE_maximal_error = 0.01 383 | 384 | n_states, n_actions = reward_probability.shape 385 | actions = np.random.choice(np.array([a for a in range(n_actions)]), size = n_states,) 386 | action_values = initialize_values( shape = (n_states, n_actions), 387 | initial_values = initial_action_values, 388 | typical_value = typical_value) 389 | yield actions 390 | yield action_values 391 | algo_IPE = IterativePolicyEvaluation() 392 | 393 | n_iter = 0 394 | while n_iter < n_iterations: 395 | yield f"DP Control (PI or VI) - Iteration {n_iter}" 396 | #Iterative Policy Evaluation 397 | probs = np.zeros((n_states, n_actions)) # convert deterministic actions to stochastic policy 398 | probs[np.arange(n_states), actions] = 1 399 | policy = DiscretePolicyForDiscreteState(probs) 400 | for action_values_or_str in algo_IPE.find_action_values_yielding( policy, #Evaluate the policy 401 | transition_probability, 402 | reward_probability, 403 | n_iterations = IPE_n_iterations, #Convergence criteria for the IPE 404 | maximal_error = IPE_maximal_error, 405 | gamma = gamma, 406 | sweep_order=sweep_order, 407 | 408 | initial_action_values = action_values, #Initialize the IPE with the previous action values computed, increase convergence a bit 409 | yield_frequency=yield_frequency, 410 | ): 411 | yield action_values_or_str 412 | #Policy improvement 413 | actions_old = actions.copy() 414 | for state in range(n_states): 415 | actions[state] = np.argmax(action_values[state]) 416 | yield actions 417 | yield action_values 418 | n_iter += 1 419 | if stop_if_policy_stable and (actions == actions_old).all(): 420 | break 421 | 422 | 423 | 424 | class ValueIteration(PolicyIteration): 425 | 426 | algo_PI = PolicyIteration() 427 | 428 | def find_optimal_policy(self, transition_probability : np.ndarray, 429 | reward_probability : np.ndarray, 430 | n_iterations : int = None, 431 | gamma : float = 1, 432 | sweep_order : str = "normal", # "normal" or "reverse" or "random" 433 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 434 | typical_value : float = 1, 435 | return_action_values : bool = False, 436 | verbose : int = 1, 437 | ) -> DiscretePolicyForDiscreteState: 438 | """This class implements the Value Iteration algorithm. It computes an optimal value function for a given model (transition_probability and reward_probability). 439 | The algorithm stop either when the value function is stable (no change in the value function) or when the number of iterations is reached. 440 | 441 | transition_probability : a numpy array of shape (n_states, n_actions, n_states) representing the transition probability between states and actions. 442 | reward_probability : a numpy array of shape (n_states, n_actions) representing the reward expected for each state and action. 443 | n_iterations : the number of iterations for the policy iteration algorithm. 444 | gamma : the discount factor 445 | sweep_order : the order in which we will iterate over the states. "normal" or "reverse" or "random". This can have a significant impact on the convergence of the algorithm. 446 | initial_values : the initial values for the action values ("random", "zeros", "optimistic" or a numpy array) 447 | typical_value : the typical value for the action values, used for scaling the "random" and "optimistic" value-initialization methods. 
448 |         return_action_values : if True, the action values are returned with the policy
449 |         verbose : the verbosity level. 0 : no print, 1 : print when VI has finished.
450 |         """
451 |         results = self.algo_PI.find_optimal_policy( transition_probability = transition_probability,
452 |                                                     reward_probability = reward_probability,
453 |                                                     n_iterations=n_iterations,
454 |                                                     IPE_n_iterations=1,
455 |                                                     gamma = gamma,
456 |                                                     sweep_order=sweep_order,
457 |                                                     initial_action_values=initial_action_values,
458 |                                                     typical_value=typical_value,
459 |                                                     return_action_values = return_action_values,
460 |                                                     stop_if_policy_stable = False,
461 |                                                     verbose = 0,)
462 | 
463 |         if verbose >= 1:
464 |             print("Value Iteration finished.")
465 | 
466 |         return results
467 | 
468 | 
469 | 
470 |     def find_optimal_policy_yielding(self, transition_probability : np.ndarray,
471 |                                         reward_probability : np.ndarray,
472 |                                         n_iterations : int = None,
473 |                                         gamma : float = 1,
474 |                                         sweep_order : str = "normal", # "normal" or "reverse" or "random"
475 |                                         initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array
476 |                                         typical_value : float = 1,
477 |                                         yield_frequency : str = "step", # "step", "iteration" or "global_iteration"
478 |                                         **kwargs,
479 |                                         ) -> Iterator:
480 |         """This method performs the Value Iteration algorithm like find_optimal_policy, but yields pi(s) (the actions) and Q(s, a) during training.
481 |         """
482 |         results = self.algo_PI.find_optimal_policy_yielding( transition_probability = transition_probability,
483 |                                                              reward_probability = reward_probability,
484 |                                                              n_iterations=n_iterations,
485 |                                                              IPE_n_iterations=1,
486 |                                                              gamma = gamma,
487 |                                                              sweep_order=sweep_order,
488 |                                                              initial_action_values=initial_action_values,
489 |                                                              typical_value=typical_value,
490 |                                                              yield_frequency=yield_frequency,
491 |                                                              stop_if_policy_stable=False,)
492 | 
493 |         return results
--------------------------------------------------------------------------------