├── environnements ├── __init__.py ├── oceanEnv.py ├── contextualBanditEnv.py └── nimEnv.py ├── requirements.txt ├── figure ├── nim_env.png ├── bandit_env.png ├── env_render.png ├── ocean_env.jpeg ├── slide_intro.png ├── slide_example.png ├── streamlit_example.png ├── DP │ ├── policy_iteration.gif │ ├── value_iteration.gif │ ├── q_values_joinBeach_estimated.gif │ ├── q_values_leaveBeach_estimated.gif │ ├── v_values_joinBeach_estimated.gif │ ├── v_values_leaveBeach_estimated.gif │ ├── q_values_swim_randomly_estimated.gif │ └── v_values_swim_randomly_estimated.gif ├── MC │ ├── MC_Control_eps_greedy.gif │ ├── q_values_joinBeach_estimated.gif │ ├── q_values_leaveBeach_estimated.gif │ ├── v_values_joinBeach_estimated.gif │ ├── v_values_leaveBeach_estimated.gif │ ├── q_values_swim_randomly_estimated.gif │ └── v_values_swim_randomly_estimated.gif └── TD │ ├── SARSA_Control_eps_greedy.gif │ ├── q_values_joinBeach_estimated.gif │ ├── q_values_leaveBeach_estimated.gif │ ├── v_values_joinBeach_estimated.gif │ ├── v_values_leaveBeach_estimated.gif │ ├── q_values_swim_randomly_estimated.gif │ └── v_values_swim_randomly_estimated.gif ├── .gitmodules ├── RL course EN v2022.pdf ├── RL course FR v2022.pdf ├── .github └── workflows │ └── sync_to_hf.yml ├── src ├── policies.py └── utils.py ├── DP ├── example_PI.py ├── example_IPE_leaveBeach.py ├── example_IPE_joinBeach.py ├── example_IPE_swimRandomly.py ├── plot_control_figures.py ├── plot_prediction_figures.py └── dynamicProgramming.py ├── .gitignore ├── TD ├── example_TD_control.py ├── plot_control_figures.py ├── example_TD_prediction.py ├── plot_prediction_figures.py └── TDLearning.py ├── MC ├── example_MC_control.py ├── plot_control_figures.py ├── example_MC_prediction.py └── plot_prediction_figures.py ├── README.md ├── playground_app ├── mappings.py └── playground.py └── streamlit_app.py /environnements/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym 2 | numpy 3 | streamlit 4 | plotly==5.9.0 5 | altair==4.0.0 -------------------------------------------------------------------------------- /figure/nim_env.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/nim_env.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "gridworld_rl"] 2 | path = gridworld_rl 3 | url = git@github.com:tboulet/gridworld_rl.git 4 | -------------------------------------------------------------------------------- /figure/bandit_env.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/bandit_env.png -------------------------------------------------------------------------------- /figure/env_render.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/env_render.png -------------------------------------------------------------------------------- /figure/ocean_env.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/ocean_env.jpeg -------------------------------------------------------------------------------- /RL course EN v2022.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/RL course EN v2022.pdf -------------------------------------------------------------------------------- /RL course FR v2022.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/RL course FR v2022.pdf -------------------------------------------------------------------------------- /figure/slide_intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/slide_intro.png -------------------------------------------------------------------------------- /figure/slide_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/slide_example.png -------------------------------------------------------------------------------- /figure/streamlit_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/streamlit_example.png -------------------------------------------------------------------------------- /figure/DP/policy_iteration.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/policy_iteration.gif -------------------------------------------------------------------------------- /figure/DP/value_iteration.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/value_iteration.gif -------------------------------------------------------------------------------- /figure/MC/MC_Control_eps_greedy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/MC_Control_eps_greedy.gif -------------------------------------------------------------------------------- /figure/TD/SARSA_Control_eps_greedy.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/SARSA_Control_eps_greedy.gif -------------------------------------------------------------------------------- /figure/DP/q_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/q_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/DP/q_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/q_values_leaveBeach_estimated.gif 
-------------------------------------------------------------------------------- /figure/DP/v_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/v_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/DP/v_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/v_values_leaveBeach_estimated.gif -------------------------------------------------------------------------------- /figure/MC/q_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/q_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/MC/q_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/q_values_leaveBeach_estimated.gif -------------------------------------------------------------------------------- /figure/MC/v_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/v_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/MC/v_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/v_values_leaveBeach_estimated.gif -------------------------------------------------------------------------------- /figure/TD/q_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/q_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/TD/q_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/q_values_leaveBeach_estimated.gif -------------------------------------------------------------------------------- /figure/TD/v_values_joinBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/v_values_joinBeach_estimated.gif -------------------------------------------------------------------------------- /figure/TD/v_values_leaveBeach_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/v_values_leaveBeach_estimated.gif -------------------------------------------------------------------------------- /figure/DP/q_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/q_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /figure/DP/v_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/DP/v_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /figure/MC/q_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/q_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /figure/MC/v_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/MC/v_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /figure/TD/q_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/q_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /figure/TD/v_values_swim_randomly_estimated.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tboulet/Formation-Reinforcement-Learning/HEAD/figure/TD/v_values_swim_randomly_estimated.gif -------------------------------------------------------------------------------- /.github/workflows/sync_to_hf.yml: -------------------------------------------------------------------------------- 1 | name: Sync to Hugging Face hub 2 | on: 3 | push: 4 | branches: [main] 5 | 6 | # to run this workflow manually from the Actions tab 7 | workflow_dispatch: 8 | 9 | jobs: 10 | sync-to-hub: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | with: 15 | fetch-depth: 0 16 | - name: Push to hub 17 | env: 18 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 19 | run: git push --force https://tboulet:$HF_TOKEN@huggingface.co/spaces/tboulet/RL-Playground main 20 | -------------------------------------------------------------------------------- /src/policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Policy: pass 4 | class PolicyForDiscreteState(Policy): pass 5 | 6 | class DiscretePolicyForDiscreteState(PolicyForDiscreteState): 7 | def __init__(self, probs : np.ndarray): 8 | self.probs = probs 9 | self.n_states, self.n_actions = probs.shape 10 | """ 11 | Example for 2 state and 4 actions. 
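        Each row of probs corresponds to a state and each column to an action: probs[s, a] is the probability of choosing action a in state s, so every row should sum to 1.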
12 | >>> probs = np.array([[0.1, 0.1, 0.7, 0.1], [0.7, 0.1, 0.2, 0.]]) 13 | >>> policy = DiscretePolicyForDiscreteState(probs) 14 | >>> state = 0 15 | >>> action = 0 16 | >>> prob_to_do_action_in_state = policy.get_prob(state, action) 17 | """ 18 | 19 | def get_prob(self, state : int, action : int) -> float: 20 | return self.probs[state, action] 21 | 22 | -------------------------------------------------------------------------------- /DP/example_PI.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 5 | from DP.dynamicProgramming import PolicyIteration 6 | 7 | algo_IP = PolicyIteration() 8 | 9 | print("\nFinding optimal policy...") 10 | ideal_policy, action_values = algo_IP.find_optimal_policy(transition_probability_ocean, 11 | reward_probability_ocean, 12 | gamma=.98, 13 | n_iterations=5, 14 | verbose=1, 15 | return_action_values=True, 16 | ) 17 | print("Optimal policy:", ideal_policy.probs) 18 | print("Final action values:", action_values) 19 | 20 | print("\nPolicy during the learning:") 21 | src.policies_and_actions = algo_IP.find_optimal_policy_yielding( transition_probability_ocean, 22 | reward_probability_ocean, 23 | gamma=.98, 24 | n_iterations=5, 25 | return_action_values=True, 26 | ) 27 | for elem in src.policies_and_actions: 28 | print(elem) 29 | -------------------------------------------------------------------------------- /environnements/oceanEnv.py: -------------------------------------------------------------------------------- 1 | from src.utils import * 2 | import gym 3 | from gym import spaces 4 | 5 | class OceanEnv(gym.Env): 6 | 7 | def __init__(self): 8 | self.action_space = spaces.Discrete(2) 9 | self.observation_space = spaces.Discrete(11) 10 | super().__init__() 11 | 12 | def reset(self) -> Observation: 13 | self.state = 10 14 | return self.state 15 | 16 | def step(self, action: Action) -> Tuple[Observation, Reward, bool]: 17 | assert action == 0 or action == 1, "Action must be in {0, 1} for the OceanEnv environnement." 18 | assert 1 <= self.state <= 10, "The agent should be between 1 and 10 meters when step is called." 
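        # Dynamics implemented below: action 0 swims one meter toward the beach
        # (state - 1) and action 1 swims one meter away (state + 1, capped at 10).
        # Every step yields a reward of -1, and the episode terminates once the
        # agent reaches the beach (state <= 0).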
19 | 20 | # Action has effect on the environment 21 | if action == 0: 22 | self.state -= 1 23 | elif action == 1: 24 | self.state += 1 25 | if self.state > 10: self.state = 10 26 | 27 | # Compute reward 28 | reward = -1 29 | 30 | # Check if env is terminated 31 | done = self.state <= 0 32 | 33 | return self.state, reward, done, {} 34 | 35 | 36 | 37 | 38 | 39 | def render(self): 40 | print(f"Agent is at {self.state} meters of the beach.") 41 | 42 | 43 | 44 | import numpy as np 45 | transition_probability_ocean = np.array([[[0 for _ in range(11)] for _ in range(2)] for _ in range(11)]) 46 | reward_probability_ocean = np.array([[0 for _ in range(2)] for _ in range(11)]) 47 | env = OceanEnv() 48 | for state in range(1, 11): 49 | for action in [0, 1]: 50 | env.state = state 51 | next_state, reward, done, info = env.step(action) 52 | transition_probability_ocean[state, action, next_state] = 1 53 | reward_probability_ocean[state, action] = reward 54 | 55 | if __name__ == "__main__": 56 | print("Transition probability:", transition_probability_ocean) 57 | print("Reward probability:", reward_probability_ocean) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv* 2 | test.py 3 | prep/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /TD/example_TD_control.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import env 5 | from TD.TDLearning import SARSA 6 | 7 | algo_SARSA = SARSA() 8 | 9 | print("\nFinding optimal policy...") 10 | optimal_policy, action_values = algo_SARSA.find_optimal_policy( env = env, 11 | gamma=.98, 12 | n_episodes = 200, 13 | n_steps = float("inf"), 14 | exploration_method='epsilon_greedy', 15 | epsilon=.1, 16 | alpha=.5, 17 | timelimit=40, 18 | return_action_values=True, 19 | initial_action_values="random", 20 | typical_value=-10, 21 | is_state_done=lambda state: state == 0, 22 | verbose=1, 23 | ) 24 | print("Optimal policy's probs:", optimal_policy.probs) 25 | print("Final action values:", action_values) 26 | 27 | print("\nActions and action values during the learning:") 28 | for elem in algo_SARSA.find_optimal_policy_yielding(env = env, 29 | gamma=.98, 30 | n_episodes = 10, 31 | n_steps = float("inf"), 32 | exploration_method='epsilon_greedy', 33 | epsilon=.1, 34 | alpha=.5, 35 | timelimit=40, 36 | return_action_values=True, 37 | initial_action_values="random", 38 | typical_value=-10, 39 | is_state_done=lambda state: state == 0, 40 | yielding_frequency="episode", 41 | ): 42 | print(elem) 43 | -------------------------------------------------------------------------------- /MC/example_MC_control.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import OceanEnv 5 | from MC.monteCarlo import MonteCarlo 6 | 7 | algo_MC = MonteCarlo() 8 | n_iterations = 10 9 | 10 | print("\nFinding optimal policy...") 11 | optimal_policy, action_values = algo_MC.find_optimal_policy(env = OceanEnv(), 12 | gamma=.98, 13 | n_iterations=n_iterations, 14 | evaluation_episodes=100, 15 | exploration_method='epsilon_greedy', 16 | epsilon=.1, 17 | visit_method="first_visit", 18 | averaging_method="moving", 19 | alpha=.1, 20 | timelimit=40, 21 | return_action_values=True, 22 | initial_action_values="random", 23 | typical_value=-10, 24 | is_state_done=lambda state: state == 0, 25 | verbose=1, 26 | ) 27 | print("Optimal policy's probs:", optimal_policy.probs) 28 | print("Final action values:", action_values) 29 | 30 | print("\nActions and action values during the learning:") 31 | for elem in algo_MC.find_optimal_policy_yielding( env = OceanEnv(), 32 | gamma=.98, 33 | n_iterations=2, 34 | evaluation_episodes=3, 35 | exploration_method='epsilon_greedy', 36 | epsilon=.1, 37 | visit_method="first_visit", 38 | averaging_method="moving", 39 | alpha=.1, 40 | timelimit=40, 41 | initial_action_values="optimistic", 42 | typical_value=-10, 43 | is_state_done=lambda state: state == 0, 44 | yield_frequency="episode", 45 | ): 46 | 
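    # Judging from how the plotting scripts in MC/ and TD/ consume this generator,
    # each yielded `elem` is either a short status string (which control/prediction
    # phase is running) or a NumPy array: a 1-D array of greedy actions per state,
    # or a 2-D array of Q-values with shape (n_states, n_actions).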
print(elem) 47 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Callable, Union 2 | import numpy as np 3 | 4 | class Observation: pass 5 | class Action: pass 6 | class Reward: pass 7 | 8 | class RL_algorithm: pass 9 | 10 | class Q_State: 11 | """A Q_State is a tuple of (observation, action)""" 12 | def __init__(self, observation: Observation, action: Action): 13 | self.observation = observation 14 | self.action = action 15 | def __hash__(self): 16 | return hash((self.observation, self.action)) 17 | def __eq__(self, other): 18 | return self.observation == other.observation and self.action == other.action 19 | def __str__(self): 20 | return f"({self.observation}, {self.action})" 21 | 22 | class Scheduler(Callable): 23 | """A Scheduler is a callable that given a number of episode or steps, returns the value of an hyper-parameter (learning rate, epsilon) to apply.""" 24 | def __init__(self, unit): 25 | if not unit in ["episodes", "steps"]: 26 | raise ValueError("Scheduler unit must be either 'episodes' or 'steps'") 27 | self.unit = unit 28 | super().__init__() 29 | def __call__(self, timestep: Union[int, None], episode : Union[int, None]): 30 | raise NotImplementedError("Scheduler must be implemented") 31 | 32 | def pretty_announcer(string): 33 | return "\n==========================================================\n" \ 34 | + string \ 35 | + "\n==========================================================\n" 36 | 37 | 38 | def initialize_values( 39 | shape : Tuple, 40 | initial_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 41 | typical_value : float = 1, 42 | ) -> np.ndarray: 43 | """This method initialize the state or action values and return it. 44 | shape : the shape of the values 45 | initial_values : the initial values 46 | typical_value : the typical value for the action values, used for scaling the "random" and "optimistic" value-initialization methods. 47 | """ 48 | 49 | 50 | if type(initial_values) == str: 51 | if initial_values == "random": 52 | values = np.random.normal(loc = 0, scale = abs(typical_value), size = shape) 53 | elif initial_values == "zeros": 54 | values = np.zeros(shape) 55 | elif initial_values == "optimistic": # Optimistic initialization is a trick that consist to overestimate the action values initially. This increase exploration for the greedy algorithms. 56 | optimistic_value = 2 * typical_value if typical_value > 0 else typical_value / 2 57 | values = np.ones(shape) * optimistic_value # An order of the magnitude of the reward is used to initialize optimistically the action values. 
58 | else: 59 | raise ValueError("The initial action values must be either 'random', 'zeros', 'optimistic' or a numpy array.") 60 | elif isinstance(initial_values, np.ndarray): 61 | values = initial_values 62 | else: 63 | raise ValueError("The initial action values must be either 'random', 'zeros', 'optimistic' or a numpy array.") 64 | 65 | return values -------------------------------------------------------------------------------- /TD/plot_control_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import OceanEnv 7 | from TD.TDLearning import SARSA 8 | from src.policies import DiscretePolicyForDiscreteState 9 | 10 | algo_SARSA = SARSA() 11 | 12 | S = np.arange(0,11) 13 | n_episodes = 50 14 | fps = 30 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | ### ====================================================================================================================== ### 23 | ### ============================================ Eps Greedy ============================================================== ### 24 | ### ====================================================================================================================== ### 25 | 26 | ### Plot the action values estimated through training 27 | src.policies_and_actions = algo_SARSA.find_optimal_policy_yielding(env = OceanEnv(), 28 | gamma=.98, 29 | n_episodes = n_episodes, 30 | n_steps = float("inf"), 31 | exploration_method='epsilon_greedy', 32 | epsilon=.1, 33 | alpha=.5, 34 | timelimit=40, 35 | return_action_values=True, 36 | initial_action_values="random", 37 | typical_value=-1, 38 | is_state_done=lambda state: state == 0, 39 | yielding_frequency="step", 40 | ) 41 | 42 | 43 | results = [e.copy() if type(e) == np.ndarray else e for e in src.policies_and_actions] 44 | 45 | bact = 4 46 | fig, ax = plt.subplots() 47 | ax.set_xlim(-1, 11) 48 | ax.set_ylim(-20, bact + 2) 49 | ax.set_xlabel("s") 50 | title = "Algorithm starting" 51 | 52 | actions_join, =ax.plot(S[results[0] == 0], [bact] * (len(S)-np.sum(results[0])), "r") 54 | qvalues_closer, = ax.plot(S, results[1][:, 0], ".g", label = "Q(s,<)") 55 | qvalues_far, = ax.plot(S, results[1][:, 1], "xr", label = "Q(s,>)") 56 | ax.legend() 57 | 58 | def update(n): 59 | data = results[n] 60 | if type(data) == str: 61 | ax.set_title(data) 62 | elif type(data) == np.ndarray: 63 | if len(data.shape) == 1: 64 | actions_join.set_data(S[data == 0], [bact] * (len(S)-np.sum(data))) 65 | actions_leave.set_data(S[data == 1], [bact] * np.sum(data)) 66 | elif len(data.shape) == 2: 67 | qvalues_closer.set_ydata(data[:, 0]) 68 | qvalues_far.set_ydata(data[:, 1]) 69 | 70 | anim = FuncAnimation( fig = fig, 71 | func = update, 72 | repeat = True, 73 | frames = np.arange(len(results)), 74 | interval = 20) 75 | 76 | anim.save("figure/TD/SARSA_Control_eps_greedy.gif", writer = "ffmpeg", fps = fps) 77 | plt.show() -------------------------------------------------------------------------------- /DP/example_IPE_leaveBeach.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 5 | from DP.dynamicProgramming import IterativePolicyEvaluation 6 | from src.policies import DiscretePolicyForDiscreteState 7 | 8 | policy_leave_beach = 
DiscretePolicyForDiscreteState(probs = np.array([[0, 1] for _ in range(11)])) 9 | 10 | algo_IPE = IterativePolicyEvaluation() 11 | 12 | print("\nComputing state values for the policy join_beach...") 13 | estimated_state_values = algo_IPE.find_state_values(policy = policy_leave_beach, 14 | transition_probability = transition_probability_ocean, 15 | reward_probability = reward_probability_ocean, 16 | n_iterations = 5, 17 | maximal_error = 0.01, 18 | gamma=0.99) 19 | print("Estimated state values :", estimated_state_values) 20 | 21 | print("\nEstimated state values during the learning:") 22 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_leave_beach, 23 | transition_probability = transition_probability_ocean, 24 | reward_probability = reward_probability_ocean, 25 | n_iterations = 12, 26 | maximal_error = 0.01, 27 | gamma = 1) 28 | for n_iter, estimated_state_values in enumerate(estimated_state_values_during_training): 29 | print(f"Iteration {n_iter} :", estimated_state_values) 30 | 31 | print("\nComputing action values for the policy join_beach...") 32 | estimated_action_values = algo_IPE.find_action_values( policy = policy_leave_beach, 33 | transition_probability=transition_probability_ocean, 34 | reward_probability=reward_probability_ocean, 35 | n_iterations=12, 36 | maximal_error=0.01, 37 | gamma=1) 38 | print("Estimated action values :", estimated_action_values) 39 | 40 | print("\nEstimated action values during the learning:") 41 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_leave_beach, 42 | transition_probability = transition_probability_ocean, 43 | reward_probability = reward_probability_ocean, 44 | n_iterations = 12, 45 | maximal_error = 0.01, 46 | gamma = 1) 47 | for n_iter, estimated_action_values in enumerate(estimated_action_values_during_training): 48 | print(f"Iteration {n_iter} :", estimated_action_values) -------------------------------------------------------------------------------- /DP/example_IPE_joinBeach.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 5 | from DP.dynamicProgramming import IterativePolicyEvaluation 6 | from src.policies import DiscretePolicyForDiscreteState 7 | 8 | policy_join_beach = DiscretePolicyForDiscreteState(probs = np.array([[1, 0] for _ in range(11)])) 9 | 10 | algo_IPE = IterativePolicyEvaluation() 11 | 12 | print("\nComputing state values for the policy join_beach...") 13 | estimated_state_values = algo_IPE.find_state_values(policy = policy_join_beach, 14 | transition_probability = transition_probability_ocean, 15 | reward_probability = reward_probability_ocean, 16 | n_iterations = 5, 17 | maximal_error = 0.01, 18 | gamma=1,) 19 | print("Estimated state values :", estimated_state_values) 20 | 21 | print("\nEstimated state values during the learning:") 22 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_join_beach, 23 | transition_probability = transition_probability_ocean, 24 | reward_probability = reward_probability_ocean, 25 | n_iterations = 12, 26 | maximal_error = 0.01, 27 | gamma = 1, 28 | ) 29 | for n_iter, estimated_state_values in enumerate(estimated_state_values_during_training): 30 | print(estimated_state_values) 31 | 32 | print("\nComputing action values for the policy join_beach...") 33 | 
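# Iterative Policy Evaluation repeatedly applies the Bellman expectation backup,
#   Q(s, a) <- R(s, a) + gamma * sum_{s'} P(s'|s, a) * sum_{a'} pi(a'|s') * Q(s', a'),
# presumably stopping once the largest update falls below `maximal_error` or after
# `n_iterations` sweeps; P and R are the tabular ocean model imported above.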
estimated_action_values = algo_IPE.find_action_values( policy = policy_join_beach, 34 | transition_probability=transition_probability_ocean, 35 | reward_probability=reward_probability_ocean, 36 | n_iterations=12, 37 | maximal_error=0.01, 38 | gamma=1) 39 | print("Estimated action values :", estimated_action_values) 40 | 41 | print("\nEstimated action values during the learning:") 42 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_join_beach, 43 | transition_probability = transition_probability_ocean, 44 | reward_probability = reward_probability_ocean, 45 | n_iterations = 12, 46 | maximal_error = 0.01, 47 | gamma = 1) 48 | for n_iter, estimated_action_values in enumerate(estimated_action_values_during_training): 49 | print(estimated_action_values) -------------------------------------------------------------------------------- /DP/example_IPE_swimRandomly.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 5 | from DP.dynamicProgramming import IterativePolicyEvaluation 6 | from src.policies import DiscretePolicyForDiscreteState 7 | 8 | policy_swim_randomly = DiscretePolicyForDiscreteState(probs = np.array([[0.8, 0.2] for _ in range(11)])) 9 | 10 | 11 | algo_IPE = IterativePolicyEvaluation() 12 | 13 | print("\nComputing state values for the policy swim_randomly...") 14 | estimated_state_values = algo_IPE.find_state_values(policy = policy_swim_randomly, 15 | transition_probability = transition_probability_ocean, 16 | reward_probability = reward_probability_ocean, 17 | n_iterations = 100, 18 | maximal_error = 0.01, 19 | gamma=.98) 20 | print("Estimated state values :", estimated_state_values) 21 | 22 | print("\nEstimated state values during the learning:") 23 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_swim_randomly, 24 | transition_probability = transition_probability_ocean, 25 | reward_probability = reward_probability_ocean, 26 | n_iterations = 1, 27 | maximal_error = 0.01, 28 | gamma = .98) 29 | for n_iter, estimated_state_values in enumerate(estimated_state_values_during_training): 30 | print(f"Iteration {n_iter} :", estimated_state_values) 31 | 32 | print("\nComputing action values for the policy swim_randomly...") 33 | estimated_action_values = algo_IPE.find_action_values( policy = policy_swim_randomly, 34 | transition_probability=transition_probability_ocean, 35 | reward_probability=reward_probability_ocean, 36 | n_iterations=100, 37 | maximal_error=0.01, 38 | gamma=.98) 39 | print("Estimated action values :", estimated_action_values) 40 | 41 | print("\nEstimated action values during the learning:") 42 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_swim_randomly, 43 | transition_probability = transition_probability_ocean, 44 | reward_probability = reward_probability_ocean, 45 | n_iterations = 1, 46 | maximal_error = 0.01, 47 | gamma = .98) 48 | for n_iter, estimated_action_values in enumerate(estimated_action_values_during_training): 49 | print(f"Iteration {n_iter} :", estimated_action_values) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Formation-Reinforcement-Learning 2 | This is the repository for the Reinforcement 
Learning course at Automatants, the AI student association of CentraleSupélec. The course was given to students of the CentraleSupélec campus as an introduction to Reinforcement Learning. 3 | 4 |

5 | 6 |

7 | 8 | Concepts covered in the first part (slides 1-39): 9 | - RL Framework (Environment (with examples), MDP, Policy, Cumulative reward, State and Action Value) 10 | - Environnement shaping (Reward shaping, State shaping, Action shaping) 11 | - Prediction and Control problems 12 | - Model-based methods : Dynamic Programming (Bellman Equations, Policy Iteration, Value Iteration) 13 | 14 | Concepts covered in the second part (slides 40-80): 15 | - Model-free methods : Monte Carlo, TD Learning (SARSA, Q-Learning, Expected SARSA), n-step TD Learning 16 | - Exploration-Exploitation Dilemma 17 | - Exploration Replay 18 | - Deep RL introduction 19 | - Deep Q Network (DQN) 20 | - Parallelization in RL 21 | - Librairies and ressources in RL 22 | 23 | Policy-based RL methods and Importance Sampling are also covered in the slides (81 - 88), but not in the lectures. 24 | 25 | 26 | 27 | 28 | # Videos 29 | 30 | Videos of the lectures are available (in French only) on the [Automatants Youtube channel](https://www.youtube.com/channel/UCZ2wKX6bJg9Yz9KdHkzjw1Q). 31 | 32 | Part 1: Introduction to Reinforcement Learning and Model-based methods (RL Framework, Bellman Equations, Dynamic Programming) 33 | 34 | - [Lecture 1: Introduction to Reinforcement Learning](https://www.youtube.com/watch?v=juNSptzWTJs) 35 | 36 | Part 2: Model-free methods and deeper concepts in RL : Monte Carlo, TD Learning (SARSA, Q-Learning, Expected SARSA), Exploration-Exploitation Dilemma, Off-Policy Learning, Deep RL intro 37 | 38 | - [Lecture 2: Deeper concepts in Reinforcement Learning](https://www.youtube.com/watch?v=LId8UpG_YY4) 39 | 40 | # Slides 41 | 42 | Slides of the lectures are available in this repository in French and English in the as powerpoint files "slides ENGLISH.pptx" and "slides FR.pptx". 43 | 44 |

45 | 46 |

47 | 48 | 49 | # Gridworld environment 50 | 51 |

52 | Q values through training 53 |

54 | 55 | The Gridworld environment is available [here](https://github.com/tboulet/gridworld_rl). It was a simple gridworld environment developped to implement the algorithms seen in the lectures. The goal was to visualize Q values or probabilities of actions during the training of the agent. Several environments/grids (with different rewards, obstacles, etc.) and several agents (including your own) are available. More information on the GitHub repository. 56 | 57 | # Streamlit app 58 | 59 | You can visualize the results of the algorithms seen in the lectures and the influence of many hyperparameters with the Streamlit app. 60 | 61 | This include 3 environnements : OceanEnv (reach the goal as fast as possible), Nim (take the last stick) and a Contextual Bandit environment (choose the best arm at each state). 62 |

63 | 64 |

65 | 66 | The app is deployed with Streamlit and should be available [here](https://share.streamlit.io/tboulet/formation-reinforcement-learning/main/app.py). 67 | 68 | If that is not the case, you can still install streamlit with pip and then run the app locally with the following command: 69 | ```bash 70 | streamlit run streamlit_app.py 71 | ``` -------------------------------------------------------------------------------- /environnements/contextualBanditEnv.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from src.utils import * 4 | import gym 5 | from gym import spaces 6 | 7 | class ContextualBanditEnv(gym.Env): 8 | n_states = 4 9 | n_actions = n_states 10 | is_terminal = True # True if 1 episode= 1step, False for a non terminal episode 11 | 12 | means = [k for k in range(n_states)] 13 | stds = [k+1 for k in range(n_states)] 14 | 15 | time_limit = float("inf") # careful, if not +oo, may lead to strange behavior perhaps for TD 16 | 17 | def __init__(self): 18 | # Define gym spaces 19 | self.action_space = spaces.Discrete(self.n_actions) 20 | self.observation_space = spaces.Discrete(self.n_states + int(self.is_terminal)) 21 | super().__init__() 22 | 23 | def reset(self) -> Observation: 24 | # Define initial state 25 | self.state = self.random_state() 26 | return self.state 27 | 28 | def step(self, action : Action) -> Tuple[Observation, Reward, bool]: 29 | # Check if action is valid (between 0 and num_rm - 1). 30 | assert action in range(self.n_actions), "Action must be in {0, 1, ..., self.n_actions - 1} for the contextualBanditEnv environnement." 31 | 32 | # Action has effect on the environment 33 | k = (self.state - action - 1) % self.n_states 34 | mean, std = self.means[k], self.stds[k] 35 | reward = np.random.normal(mean, std) 36 | if self.is_terminal: 37 | done = True 38 | self.state = self.n_states 39 | else: 40 | done = False 41 | self.state = self.random_state() 42 | 43 | return self.state, reward, done, {} 44 | 45 | def random_state(self): 46 | return random.choice(range(self.n_states)) 47 | 48 | def render(self, **kwargs): 49 | pass 50 | 51 | # For non terminal bandit problem 52 | if ContextualBanditEnv.is_terminal: 53 | transition_probability_CB = np.array([[[0 for _ in range(ContextualBanditEnv.n_states + 1)] for _ in range(ContextualBanditEnv.n_actions)] for _ in range(ContextualBanditEnv.n_states + 1)], dtype = float) 54 | for state in range(ContextualBanditEnv.n_states): 55 | for action in range(ContextualBanditEnv.n_actions): 56 | transition_probability_CB[state][action][ContextualBanditEnv.n_states] = 1 57 | reward_probability_CB = np.array([[0 for _ in range(ContextualBanditEnv.n_actions)] for _ in range(ContextualBanditEnv.n_states + 1)], dtype = float) 58 | for state in range(ContextualBanditEnv.n_states): 59 | for action in range(ContextualBanditEnv.n_actions): 60 | k = (state - action - 1) % ContextualBanditEnv.n_states 61 | reward_probability_CB[state][action] = ContextualBanditEnv.means[k] 62 | # For terminal bandit problem (1 episode = 1 step) 63 | else: 64 | transition_probability_CB = np.array([[[0 for _ in range(ContextualBanditEnv.n_states)] for _ in range(ContextualBanditEnv.n_actions)] for _ in range(ContextualBanditEnv.n_states)], dtype = float) 65 | reward_probability_CB = np.array([[0 for _ in range(ContextualBanditEnv.n_actions)] for _ in range(ContextualBanditEnv.n_states)], dtype = float) 66 | for state in range(ContextualBanditEnv.n_states): 67 | for action in 
range(ContextualBanditEnv.n_actions): 68 | k = (state - action - 1) % ContextualBanditEnv.n_states 69 | reward_probability_CB[state][action] = ContextualBanditEnv.means[k] 70 | 71 | 72 | 73 | if __name__ == "__main__": 74 | print("Transition probability for state 0 P(0,a,s'):", transition_probability_CB[0]) 75 | print("Reward probability:", reward_probability_CB) -------------------------------------------------------------------------------- /MC/plot_control_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import OceanEnv 7 | from MC.monteCarlo import MonteCarlo 8 | from src.policies import DiscretePolicyForDiscreteState 9 | 10 | algo_MC = MonteCarlo() 11 | 12 | n_iterations = 8 13 | n_iterations_evaluation = 40 14 | S = np.arange(0,11) 15 | fps = 30 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | ### ====================================================================================================================== ### 24 | ### ============================================ Eps Greedy ============================================================== ### 25 | ### ====================================================================================================================== ### 26 | 27 | ### Plot the action values estimated through training 28 | policies_and_actions = algo_MC.find_optimal_policy_yielding( env = OceanEnv(), 29 | gamma=.98, 30 | n_iterations=n_iterations, 31 | evaluation_episodes=n_iterations_evaluation, 32 | exploration_method='epsilon_greedy', 33 | epsilon=.1, 34 | visit_method="first_visit", 35 | averaging_method="moving", 36 | alpha=.1, 37 | timelimit=40, 38 | initial_action_values="random", 39 | typical_value=-10, 40 | is_state_done=lambda state: state == 0, 41 | ) 42 | 43 | 44 | results = [e.copy() if type(e) == np.ndarray else e for e in policies_and_actions] 45 | 46 | bact = 4 47 | fig, ax = plt.subplots() 48 | ax.set_xlim(-1, 11) 49 | ax.set_ylim(-20, bact + 2) 50 | ax.set_xlabel("s") 51 | title_control = f"MC Control : 0/{n_iterations}" 52 | title_prediction = f"MC Prediction : 0/{n_iterations_evaluation}" 53 | 54 | actions_join, =ax.plot(S[results[0] == 0], [bact] * (len(S)-np.sum(results[0])), "r") 56 | qvalues_closer, = ax.plot(S, results[1][:, 0], ".g", label = "Q(s,<)") 57 | qvalues_far, = ax.plot(S, results[1][:, 1], "xr", label = "Q(s,>)") 58 | ax.legend() 59 | 60 | def update(n): 61 | global title_control, title_prediction 62 | if n>= len(results): 63 | ax.set_title("MC Control (ended)") 64 | return 65 | data = results[n] 66 | if type(data) == str: 67 | if "MC Control" in data: 68 | title_control = data 69 | ax.set_title(title_control + " - " + title_prediction) 70 | elif "MC Prediction" in data: 71 | title_prediction = data 72 | ax.set_title(title_control + " - " + title_prediction) 73 | elif type(data) == np.ndarray: 74 | if len(data.shape) == 1: 75 | actions_join.set_data(S[data == 0], [bact] * (len(S)-np.sum(data))) 76 | actions_leave.set_data(S[data == 1], [bact] * np.sum(data)) 77 | elif len(data.shape) == 2: 78 | qvalues_closer.set_ydata(data[:, 0]) 79 | qvalues_far.set_ydata(data[:, 1]) 80 | 81 | anim = FuncAnimation( fig = fig, 82 | func = update, 83 | repeat = True, 84 | frames = np.arange(len(results)), 85 | interval = 20) 86 | plt.show() 87 | anim.save("figure/MC/MC_Control_eps_greedy.gif", writer = "ffmpeg", fps = 30) 88 | 
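The two control-plotting scripts above visualize the output of an epsilon-greedy control loop whose implementation (MC/monteCarlo.py, TD/TDLearning.py) is not included in this excerpt. As a rough, hypothetical sketch of the two operations being plotted, epsilon-greedy action selection and greedy-policy extraction from a Q-table, something like the following could be used (the function name and the dummy Q-table are illustrative, not the repository's API):

```python
import numpy as np

def epsilon_greedy_action(Q: np.ndarray, state: int, epsilon: float = 0.1) -> int:
    """With probability epsilon pick a uniformly random action, otherwise the greedy one."""
    n_actions = Q.shape[1]
    if np.random.random() < epsilon:
        return int(np.random.randint(n_actions))
    return int(np.argmax(Q[state]))

# Dummy Q-values for the 11 ocean states and 2 actions.
Q = np.random.normal(loc=-10, scale=1.0, size=(11, 2))
a = epsilon_greedy_action(Q, state=10)

# The greedy actions drawn at the top of the animations (actions_join / actions_leave)
# can be recovered as the argmax of the Q-table over the action axis:
greedy_actions = np.argmax(Q, axis=1)  # shape (11,), values in {0, 1}
```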
-------------------------------------------------------------------------------- /environnements/nimEnv.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from src.utils import * 4 | import gym 5 | from gym import spaces 6 | 7 | class NimEnv(gym.Env): 8 | num_initial_objects = 21 # Number of objects initially in the pile 9 | num_rm = 3 # Maximal number of objects removable each turn 10 | 11 | def __init__(self): 12 | # Define gym spaces 13 | self.action_space = spaces.Discrete(self.num_rm) 14 | self.observation_space = spaces.Discrete(self.num_initial_objects + 1) 15 | super().__init__() 16 | 17 | def reset(self) -> Observation: 18 | # Define initial state 19 | self.num_objects = self.num_initial_objects 20 | return self.num_objects 21 | 22 | def step(self, action) -> Tuple[Observation, Reward, bool]: 23 | # Check if action is valid (between 0 and num_rm - 1). 24 | assert self.num_objects > 0, "The game should not be finished when step() is called." 25 | assert action in range(self.num_rm), "Action must be in {0, 1, ..., self.num_rm - 1} for the nimEnv environnement." 26 | action += 1 27 | # Action has effect on the environment 28 | self.num_objects -= action 29 | # Compute reward and done 30 | if self.num_objects <= 0: 31 | reward = -1 32 | done = True 33 | self.num_objects = 0 34 | else: 35 | action_opponent = self.opponent_act(state = self.num_objects) 36 | self.num_objects -= action_opponent 37 | if self.num_objects <= 0: 38 | reward = 1 39 | done = True 40 | self.num_objects = 0 41 | else: 42 | reward = 0 43 | done = False 44 | # Return observation, reward, done, and info 45 | return self.num_objects, reward, done, {} 46 | 47 | def opponent_act(self, state : Observation = None) -> Action: 48 | # Choose action according to opponent policy (uniformly random) 49 | action = random.choice(range(self.num_rm)) + 1 50 | return action 51 | 52 | def render(self, **kwargs): 53 | print(f"{self.num_objects}/{self.num_initial_objects} objects remaining.") 54 | 55 | 56 | import numpy as np 57 | n_states = NimEnv.num_initial_objects + 1 58 | n_actions = NimEnv.num_rm 59 | transition_probability_nim = np.array([[[0 for _ in range(n_states)] for _ in range(n_actions)] for _ in range(n_states)], dtype = float) 60 | reward_probability_nim = np.array([[0 for _ in range(n_actions)] for _ in range(n_states)], dtype = float) 61 | env = NimEnv() 62 | for state in range(1, n_states): 63 | for action in range(n_actions): 64 | num_objects_removed = action + 1 65 | num_objects_remaining = state - num_objects_removed 66 | 67 | # Here the agent failed and remove the last object, reaching state 0 and receiving a reward of -1 68 | if num_objects_remaining <= 0: 69 | reward_probability_nim[state, action] = -1 70 | transition_probability_nim[state, action, 0] = 1 71 | # Here the agent did not remove the last object, and the opponent may remove it. 72 | else: 73 | prob = 1 / n_actions 74 | for action_opponent in range(n_actions): 75 | num_objects_removed_opponent = action_opponent + 1 76 | num_objects_remaining_opponent = num_objects_remaining - num_objects_removed_opponent 77 | # Here the agent did not remove the last object, and the opponent did not remove it. 78 | if num_objects_remaining_opponent > 0: 79 | transition_probability_nim[state, action, num_objects_remaining_opponent] = prob 80 | # Here the agent did not remove the last object, and the opponent removed the last object. 81 | # The agent receives a reward of 1 and reaches state 0. 
82 | else: 83 | reward_probability_nim[state, action] += prob 84 | transition_probability_nim[state, action, 0] += prob 85 | 86 | if __name__ == "__main__": 87 | print("Transition probability for state 5 P(5,a,s'):", transition_probability_nim[5]) 88 | print("Reward probability:", reward_probability_nim) -------------------------------------------------------------------------------- /playground_app/mappings.py: -------------------------------------------------------------------------------- 1 | from MC.monteCarlo import MonteCarlo 2 | from DP.dynamicProgramming import IterativePolicyEvaluation, PolicyIteration, ValueIteration 3 | from TD.TDLearning import TD, SARSA 4 | 5 | from environnements.oceanEnv import OceanEnv, transition_probability_ocean, reward_probability_ocean 6 | from environnements.nimEnv import NimEnv, transition_probability_nim, reward_probability_nim 7 | from environnements.contextualBanditEnv import ContextualBanditEnv, transition_probability_CB, reward_probability_CB 8 | 9 | map_name_to_algo = {"IterativePolicyEvaluation": { "Algo": IterativePolicyEvaluation, 10 | "family": "DP"}, 11 | "PolicyIteration": {"Algo": PolicyIteration, 12 | "family": "DP"}, 13 | "ValueIteration": {"Algo": ValueIteration, 14 | "family": "DP"}, 15 | "MonteCarlo": { "Algo": MonteCarlo, 16 | "family": "MC"}, 17 | "TD(0)": { "Algo": TD, 18 | "family": "TD"}, 19 | "SARSA" : {"Algo" : SARSA, 20 | "family": "TD"}, 21 | 22 | } 23 | 24 | map_name_to_env = { "Ocean Env": { "Env" : OceanEnv, 25 | "model" : (transition_probability_ocean, reward_probability_ocean), 26 | "is_state_done" : lambda state : state == 0, 27 | "range_values" : [-20, 5], 28 | "image_path" : "figure/ocean_env.jpeg", 29 | "description" : "In this environment you need to reach the beach as fast as possible. \ 30 | You start in the ocean and you can only move in the 2 directions. \ 31 | The state consist of the distance with the beach and is represented by an integer between 0 and 10 \ 32 | (you can't go more far than 10). The reward is -1 at each step and 0 when you reach the beach. \ 33 | The episode ends when you reach the beach. \ 34 | ", 35 | }, 36 | 37 | "Nim's Game" : { "Env" : NimEnv, 38 | "model" : (transition_probability_nim, reward_probability_nim), 39 | "is_state_done" : lambda state : state <= 0, 40 | "range_values" : [-2, 2], 41 | "image_path" : "figure/nim_env.png", 42 | "description" : "In this game you start with 10 matches and you can remove 1, 2 or 3 matches at each step (those are your actions). The player that removes the last match loses. You play against a random agent. The state consist of the number of matches left and is represented by an integer between 0 and n_matches=25. The reward is 1 if you win, -1 if you lose and 0 if the game is not finished. The episode ends when the game is finished." 43 | }, 44 | 45 | "n-Bandit Contextual" : { "Env" : ContextualBanditEnv, 46 | "model" : (transition_probability_CB, reward_probability_CB), 47 | "is_state_done" : lambda state : state == -1, 48 | "range_values" : [-1, 4], 49 | "image_path" : "figure/bandit_env.png", 50 | "description" : "In this famous environment, which is a foundation problem of theoretical RL, you have a slot machine with 4 arms. Each arm ill give you a reward following a random law that you don't now. This is contextual because which arm is better depends on the state. 
In particular here, the expected reward is r(s,a) = (s-a-1)%4 so the optimal action for each state is pi*(s)=s.", 51 | }, 52 | 53 | } 54 | 55 | map_problem_to_algo_names = { "Prediction Problem" : ["MonteCarlo", "IterativePolicyEvaluation", "TD(0)", "SARSA"], 56 | "Control Problem" : ["MonteCarlo", "PolicyIteration", "ValueIteration", "SARSA"], 57 | } -------------------------------------------------------------------------------- /TD/example_TD_prediction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import OceanEnv 5 | from TD.TDLearning import TD, SARSA 6 | from src.policies import DiscretePolicyForDiscreteState 7 | 8 | policy_swim_randomly = DiscretePolicyForDiscreteState(probs = np.array([[0.8, 0.2] for _ in range(11)])) 9 | 10 | algo_TD = TD() 11 | algo_SARSA = SARSA() 12 | 13 | print("\nComputing state values for the policy policy_swim_randomly...") 14 | estimated_state_values = algo_TD.find_state_values( policy = policy_swim_randomly, 15 | env = OceanEnv(), 16 | n_episodes = 100, 17 | n_steps = 100000, 18 | gamma=0.99, 19 | alpha=0.5, 20 | timelimit=40, 21 | initial_state_values="random", 22 | typical_value = -5, 23 | exploring_starts=False, 24 | is_state_done=lambda state: state == 0, 25 | verbose=0, 26 | ) 27 | print("Estimated state values :", estimated_state_values) 28 | 29 | print("\nEstimated state values during the learning:") 30 | estimated_state_values_during_training = algo_TD.find_state_values_yielding(policy = policy_swim_randomly, 31 | env = OceanEnv(), 32 | n_episodes = 50, 33 | n_steps = float("inf"), 34 | gamma=0.99, 35 | alpha=0.1, 36 | timelimit=40, 37 | initial_state_values="random", 38 | typical_value = -5, 39 | exploring_starts=False, 40 | is_state_done=lambda state: state == 0, 41 | 42 | yield_frequency="episode", 43 | ) 44 | for estimated_state_values in estimated_state_values_during_training: 45 | print(estimated_state_values) 46 | 47 | print("\nComputing action values for the policy policy_swim_randomly...") 48 | estimated_action_values = algo_SARSA.find_action_values( policy = policy_swim_randomly, 49 | env = OceanEnv(), 50 | n_episodes = 100, 51 | n_steps = float("inf"), 52 | gamma=0.99, 53 | alpha=0.1, 54 | timelimit=40, 55 | initial_action_values="random", 56 | typical_value = -5, 57 | exploring_starts=False, 58 | is_state_done=lambda state: state == 0, 59 | verbose=0, 60 | ) 61 | print("Estimated action values :", estimated_action_values) 62 | 63 | print("\nEstimated action values during the learning:") 64 | estimated_action_values_during_training = algo_TD.find_action_values_yielding( policy = policy_swim_randomly, 65 | env = OceanEnv(), 66 | n_episodes = 1, 67 | n_steps = 10, 68 | gamma=0.99, 69 | alpha=0.1, 70 | timelimit=40, 71 | initial_action_values="random", 72 | typical_value = -5, 73 | exploring_starts=False, 74 | is_state_done=lambda state: state == 0, 75 | 76 | yield_frequency="step", 77 | ) 78 | for estimated_action_values in estimated_action_values_during_training: 79 | print(estimated_action_values) -------------------------------------------------------------------------------- /MC/example_MC_prediction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from src.utils import * 4 | from environnements.oceanEnv import OceanEnv 5 | from MC.monteCarlo import MonteCarlo 6 | from src.policies import DiscretePolicyForDiscreteState 7 | 8 | 
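# The "swim randomly" policy defined below moves toward the beach (action 0) with
# probability 0.8 and away from it (action 1) with probability 0.2, in all 11 states.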
policy_swim_randomly = DiscretePolicyForDiscreteState(probs = np.array([[0.8, 0.2] for _ in range(11)])) 9 | 10 | algo_MC = MonteCarlo() 11 | 12 | print("\nComputing state values for the policy policy_swim_randomly...") 13 | estimated_state_values = algo_MC.find_state_values( policy = policy_swim_randomly, 14 | env = OceanEnv(), 15 | n_episodes = 10, 16 | gamma=0.98, 17 | visit_method="first_visit", 18 | averaging_method="moving", 19 | alpha=0.1, 20 | timelimit=40, 21 | initial_state_values="random", 22 | typical_value = -5, 23 | exploring_starts=False, 24 | is_state_done=lambda state: state == 0, 25 | verbose=1, 26 | ) 27 | print("Estimated state values :", estimated_state_values) 28 | 29 | print("\nEstimated state values during the learning:") 30 | estimated_state_values_during_training = algo_MC.find_state_values_yielding(policy = policy_swim_randomly, 31 | env = OceanEnv(), 32 | n_episodes = 2, 33 | gamma=0.98, 34 | visit_method="first_visit", 35 | averaging_method="moving", 36 | alpha=0.1, 37 | timelimit=40, 38 | initial_state_values="random", 39 | typical_value = -5, 40 | exploring_starts=False, 41 | is_state_done=lambda state: state == 0, 42 | ) 43 | for estimated_state_values in estimated_state_values_during_training: 44 | print(estimated_state_values) 45 | 46 | print("\nComputing action values for the policy policy_swim_randomly...") 47 | estimated_action_values = algo_MC.find_action_values( policy = policy_swim_randomly, 48 | env = OceanEnv(), 49 | n_episodes=10, 50 | gamma=0.98, 51 | visit_method="first_visit", 52 | averaging_method="moving", 53 | alpha=0.05, 54 | timelimit=40, 55 | initial_action_values="random", 56 | typical_value=-10, 57 | exploring_starts=False, 58 | is_state_done=lambda state: state == 0, 59 | ) 60 | print("Estimated action values :", estimated_action_values) 61 | 62 | print("\nEstimated action values during the learning:") 63 | estimated_action_values_during_training = algo_MC.find_action_values_yielding( policy = policy_swim_randomly, 64 | env = OceanEnv(), 65 | n_episodes=2, 66 | gamma=0.98, 67 | visit_method="first_visit", 68 | averaging_method="moving", 69 | alpha=0.05, 70 | hotimelimitrizon=40, 71 | initial_action_values="random", 72 | typical_value=-10, 73 | exploring_starts=False, 74 | is_done_states=lambda state: state == 0, 75 | ) 76 | for estimated_action_values in estimated_action_values_during_training: 77 | print(estimated_action_values) -------------------------------------------------------------------------------- /playground_app/playground.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import numpy as np 3 | import pandas as pd 4 | import plotly.express as px 5 | import plotly.graph_objects as go 6 | from src.utils import * 7 | 8 | def run_rl_algorithm(**config): 9 | """Run an algorithm and display result on streamlit. 10 | """ 11 | st.header("Results of training:") 12 | if config["family"] == "MC": 13 | config["yield_frequency"] = st.selectbox("Display a frame each... (higher frequency slow down displaying)", ["step", "episode", "iteration"], index=1) 14 | elif config["family"] == "DP": 15 | config["yield_frequency"] = st.selectbox("Display a frame each... (higher frequency slow down displaying)", ["step", "iteration", "global_iteration"], index=1) 16 | elif config["family"] == "TD": 17 | config["yield_frequency"] = st.selectbox("Display a frame each... 
(higher frequency slow down displaying)", ["step", "episode"], index=1) 18 | else: 19 | raise ValueError("Unknown family: {}".format(config["family"])) 20 | 21 | # Generate for the good problem with the good algo and for the specified config 22 | algo = config["algo"] 23 | problem = config["problem"] 24 | try: 25 | if problem == "Prediction Problem": 26 | values_type = config["values_type"] 27 | if values_type == "State values V": 28 | datas = algo.find_state_values_yielding(**config) 29 | elif values_type == "Action values Q": 30 | datas = algo.find_action_values_yielding(**config) 31 | elif problem == "Control Problem": 32 | datas = algo.find_optimal_policy_yielding(**config) 33 | except AttributeError: 34 | raise ValueError(f"Algorithm {config['algo_name']} does not work for finding the specified values (if Prediction Problem) or finding the optimal policy. Please change problem or values kind.") 35 | 36 | #Treat this data 37 | title = "Algo starting" 38 | title_control = "" 39 | title_prediction = "" 40 | 41 | num_frame = 0 42 | frame_titles = dict() 43 | datas_list = list() 44 | env = config["env"] 45 | n_states, n_actions = env.observation_space.n, env.action_space.n 46 | 47 | greedy_actions = None 48 | a, b = config["range_values"] 49 | y_greedy_actions = 0.9 * b + 0.1 * a 50 | for data in datas: 51 | # If the data is a string, modify the title of the next frames. 52 | if type(data) == str: 53 | if "Prediction" in data: 54 | title_prediction = data 55 | elif "Control" in data: 56 | title_control = data 57 | title = title_control + " | " + title_prediction 58 | 59 | # If the data is an array, it can either be a Q(s,a), V(s) or greedy_actions(s). We are building a new frame. 60 | elif type(data) == np.ndarray: 61 | # Save the title of the frame. We will apply this title later. 62 | frame_titles[num_frame] = title 63 | # Add plot of actions. 64 | if greedy_actions is not None: 65 | for state in range(n_states): 66 | datas_list.append([num_frame, state, greedy_actions[state], y_greedy_actions]) 67 | # Add plot of Q values 68 | if len(data.shape) == 2: # Q values 69 | for state in range(n_states): 70 | for action in range(n_actions): 71 | datas_list.append([num_frame, state, action, data[state, action]]) 72 | # Add plot of V values or update greedy_actions depending of the nature of the problem which define the type of 1-dimensionnaly shaped data returned (V or actions). 73 | elif len(data.shape) == 1: # 74 | if problem == "Prediction Problem": #V values 75 | for state in range(n_states): 76 | datas_list.append([num_frame, state, -1, data[state]]) 77 | elif problem == "Control Problem": # greedy actions 78 | greedy_actions = data 79 | 80 | else: 81 | raise ValueError("data must be either a string or a numpy array") 82 | num_frame += 1 83 | #Create df and plotly figure : we plot the value in function of the state, and the time-axis is defined as frame. We group data by action to distinguish Q(s,a) for different a. 
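# Sketch of the long-format dataframe built just below (one row per plotted point; the column
# semantics follow from the appends above):
#   frame | state | action            | values
#   ------+-------+-------------------+----------------------------------------------
#     k   |   s   | a                 | Q(s, a) estimate at frame k
#     k   |   s   | -1                | V(s) estimate at frame k (Prediction Problem)
#     k   |   s   | greedy_actions[s] | y_greedy_actions, a constant marker height (Control Problem)
# Plotly then animates over "frame" and colours the points by "action".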
84 | df = pd.DataFrame(datas_list, columns=["frame", "state", "action", "values"]) 85 | range_states = [-1, env.observation_space.n] 86 | fig = px.scatter(df, x = "state", 87 | y = "values", 88 | color = "action", # if values_type == "Action values Q" else None, 89 | animation_frame="frame", 90 | range_x=range_states, range_y=config["range_values"]) 91 | 92 | #This is for animated title for an animation (only way kekw) 93 | if len(fig.layout.updatemenus) == 0: raise ValueError("Likely cause of this error : The frequency for frame doesn't make sense for this algorithm, please change.") 94 | for button in fig.layout.updatemenus[0].buttons: 95 | button['args'][1]['frame']['redraw'] = True 96 | for k in range(len(fig.frames)): 97 | fig.frames[k]['layout'].update(title_text=frame_titles[k]) 98 | 99 | 100 | # fig.add_trace(go.Scatter( 101 | # x=range_states, 102 | # y=[y_greedy_actions] * len(range_states), 103 | # name="", 104 | # )) 105 | 106 | #Display the figure 107 | if st.checkbox("Display training"): 108 | st.plotly_chart(fig) 109 | if greedy_actions is not None: 110 | st.write(f"The points that stays at y={y_greedy_actions} represents the greedy action. They are those chosen by the agent in the case of a greedy policy.") -------------------------------------------------------------------------------- /streamlit_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import numpy as np 3 | 4 | from src.utils import * 5 | from src.policies import DiscretePolicyForDiscreteState 6 | 7 | from playground_app.playground import * 8 | from playground_app.mappings import map_name_to_algo, map_name_to_env, map_problem_to_algo_names 9 | 10 | st.title("Reinforcement Learning Playground") 11 | config = {} 12 | 13 | # Input 1 : env and problem type 14 | st.sidebar.header("Problem") 15 | env_name = st.sidebar.selectbox("Environment", map_name_to_env.keys()) 16 | st.sidebar.caption(map_name_to_env[env_name]["description"]) 17 | problem = st.sidebar.selectbox("Problem", ["Prediction Problem", "Control Problem"]) 18 | 19 | env_dict = map_name_to_env[env_name] 20 | Pssa, Rsa = env_dict["model"] 21 | env = env_dict["Env"]() 22 | env_description = env_dict["description"] 23 | env_image_path = env_dict["image_path"] 24 | config["env"] = env 25 | config["transition_probability"] = Pssa 26 | config["reward_probability"] = Rsa 27 | config["range_values"] = env_dict["range_values"] 28 | config["problem"] = problem 29 | 30 | st.header(f"Environment : {env_name}") 31 | st.caption(env_description) 32 | st.image(env_image_path) 33 | 34 | 35 | if problem == "Prediction Problem": 36 | # Input 2 : policy to evaluate, value type and algo 37 | st.header("Algorithm used") 38 | 39 | algo_name = st.selectbox("Algorithm", map_problem_to_algo_names["Prediction Problem"]) 40 | Algo = map_name_to_algo[algo_name]["Algo"] 41 | 42 | values_type = st.selectbox("Values to estimate", ["State values V", "Action values Q"]) 43 | 44 | n_actions = env.action_space.n 45 | action_probs = list() 46 | st.caption("Policy to evaluate: (will be normalized). 
This playground can only evaluate blind policy (non dependant on states).") 47 | for action in range(n_actions): 48 | action_probs.append(st.slider(f"Action {action}", 0, 100, value=50)) 49 | action_probs = np.array(action_probs) / np.sum(action_probs) 50 | probs = np.array([action_probs for _ in range(env.observation_space.n)]) 51 | policy = DiscretePolicyForDiscreteState(probs = probs) 52 | 53 | 54 | 55 | config["policy"] = policy 56 | config["algo_name"] = algo_name 57 | config["algo"] = Algo() 58 | config["family"] = map_name_to_algo[algo_name]["family"] 59 | config["values_type"] = values_type 60 | 61 | elif problem == "Control Problem": 62 | # Input 2 : algo 63 | st.header("Algorithm used") 64 | 65 | algo_name = st.selectbox("Algorithm", map_problem_to_algo_names["Control Problem"]) 66 | Algo = map_name_to_algo[algo_name]["Algo"] 67 | 68 | config["algo_name"] = algo_name 69 | config["algo"] = Algo() 70 | config["family"] = map_name_to_algo[algo_name]["family"] 71 | 72 | 73 | # Input 3 : Problem-related parameters 74 | st.header("Hyperparameters") 75 | col_problem, col_algo = st.columns(2) 76 | with col_problem: 77 | if problem == "Prediction Problem": 78 | st.subheader("Prediction problem:") 79 | if map_name_to_algo[algo_name]["family"] == "MC": # n_episode 80 | config["n_episodes"] = st.number_input("Number of episodes", value=20) 81 | config["exploring_starts"] = st.checkbox("Exploring starts", value=False) # exploring_starts 82 | if config["exploring_starts"]: config["is_state_done"] = map_name_to_env[env_name]["is_state_done"] 83 | elif map_name_to_algo[algo_name]["family"] == "TD": 84 | pass 85 | elif map_name_to_algo[algo_name]["family"] == "DP": # n_iterations 86 | config["n_iterations"] = st.number_input("Number of iterations", value=20) 87 | 88 | elif problem == "Control Problem": 89 | st.subheader("Control problem:") 90 | if map_name_to_algo[algo_name]["family"] == "MC": 91 | config["n_iterations"] = st.number_input("Number of iterations", value=10) 92 | config["evaluation_episodes"] = st.number_input("Number of episodes at each evaluation of the policy", value=50) 93 | if map_name_to_algo[algo_name]["family"] == "TD": 94 | pass 95 | if map_name_to_algo[algo_name]["family"] == "DP": 96 | config["n_iterations"] = st.number_input("Number of iterations", value=10) 97 | # Input 4 : Algorithm-related parameters 98 | with col_algo: 99 | if map_name_to_algo[algo_name]["family"] == "MC": 100 | st.subheader("Monte Carlo:") 101 | config["visit_method"] = st.selectbox("Visit method", ["first_visit"]) 102 | config["averaging_method"] = st.selectbox("Averaging method", ["cumulative", "moving"]) 103 | config["alpha"] = st.slider("Learning rate", 0.0, 1.0, value=0.1) 104 | if problem == "Prediction Problem": 105 | pass 106 | elif problem == "Control Problem": 107 | config["exploration_method"] = st.selectbox("Exploration method", ["epsilon_greedy", "greedy", "exploring_starts"]) 108 | if config["exploration_method"] == "epsilon_greedy": 109 | config["epsilon"] = st.slider("Epsilon", 0., 1., value=0.1) 110 | 111 | if map_name_to_algo[algo_name]["family"] == "TD": 112 | st.subheader("TD Learning:") 113 | config["n_episodes"] = st.number_input("Maximal duration in episodes", value=20) 114 | config["n_steps"] = st.number_input("Maximal duration in steps", value=30*20) 115 | config["alpha"] = st.slider("Learning rate", 0.0, 1.0, value=0.1) 116 | 117 | if map_name_to_algo[algo_name]["family"] == "DP": 118 | st.subheader("Dynamic Programming:") 119 | st.write("Criterium for convergence of DP 
algorithms:") 120 | maximal_error = st.number_input("Error threshold for convergence", value=0.01) 121 | config["maximal_error"] = maximal_error 122 | config["IPE_maximal_error"] = maximal_error 123 | config["sweep_order"] = st.selectbox("Sweep order for states", ["normal", "reverse", "random"]) 124 | if problem == "Prediction Problem": 125 | pass 126 | elif problem == "Control Problem": 127 | config["IPE_n_iterations"] = st.number_input("Number of iterations for the IPE algorithm", value=20) 128 | 129 | # Input 4 : Hyperparameters 130 | with st.sidebar: 131 | st.header("Environnement hyperparameters") 132 | config["gamma"] = st.number_input("Discount factor", value=0.95) 133 | config["timelimit"] = st.slider("Time limit", 0, 100, value=40) 134 | initial_values = st.selectbox("Initial values", ["zeros", "random", "optimistic"]) 135 | config["initial_state_values"] = initial_values 136 | config["initial_action_values"] = initial_values 137 | config["typical_value"] = st.number_input("Typical value (in magnitude order)", value=1) 138 | 139 | # Output : compute values and display 140 | run_rl_algorithm(**config) -------------------------------------------------------------------------------- /DP/plot_control_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 7 | from DP.dynamicProgramming import PolicyIteration, ValueIteration 8 | 9 | n_iterations = 10 10 | S = np.arange(0,11) 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | ### ====================================================================================================================== ### 20 | ### ============================================ Policy Iteration ======================================================== ### 21 | ### ====================================================================================================================== ### 22 | 23 | algo_PI = PolicyIteration() 24 | 25 | ### Plot the state values estimated through training 26 | src.policies_and_actions = algo_PI.find_optimal_policy_yielding(transition_probability=transition_probability_ocean, 27 | reward_probability=reward_probability_ocean, 28 | gamma=.98, 29 | n_iterations=8, 30 | IPE_n_iterations=5, 31 | IPE_threshold=.05, 32 | sweep_order="random", 33 | initial_action_values="random", 34 | typical_value=-1, 35 | yield_frequency="step", 36 | ) 37 | 38 | 39 | results = [e.copy() if type(e) == np.ndarray else e for e in src.policies_and_actions] 40 | 41 | bact = 4 42 | fig, ax = plt.subplots() 43 | ax.set_xlim(-1, 11) 44 | ax.set_ylim(-20, bact + 2) 45 | ax.set_xlabel("s") 46 | title_control = f"DP Control (PI or VI) - Iteration 0" 47 | title_prediction = f"DP Prediction of Q (IPE) - Iteration 0" 48 | 49 | # actions, = ax.plot(S, results[1] + bact, ".b", label = "Actions") 50 | actions_join, =ax.plot(S[results[0] == 0], [bact] * (len(S)-np.sum(results[0])), "r") 52 | qvalues_closer, = ax.plot(S, results[1][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 53 | qvalues_far, = ax.plot(S, results[1][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 54 | ax.legend() 55 | 56 | def update(n): 57 | global title_control, title_prediction 58 | if n>= len(results): 59 | ax.set_title("Policy Iteration (ended)") 60 | return 61 | data = results[n] 62 | if 
type(data) == str: 63 | if "Control" in data: 64 | title_control = data 65 | ax.set_title(title_control + " | " + title_prediction) 66 | elif "Prediction" in data: 67 | title_prediction = data 68 | ax.set_title(title_control + " | " + title_prediction) 69 | elif type(data) == np.ndarray: 70 | if len(data.shape) == 1: 71 | actions_join.set_data(S[data == 0], [bact] * (len(S)-np.sum(data))) 72 | actions_leave.set_data(S[data == 1], [bact] * np.sum(data)) 73 | elif len(data.shape) == 2: 74 | qvalues_closer.set_ydata(data[:, 0]) 75 | qvalues_far.set_ydata(data[:, 1]) 76 | 77 | anim = FuncAnimation( fig = fig, 78 | func = update, 79 | repeat = True, 80 | frames = np.arange(2, len(results)), 81 | interval = 100) 82 | 83 | plt.show() 84 | anim.save("figure/DP/policy_iteration.gif", writer = "ffmpeg", fps = 30) 85 | 86 | 87 | 88 | 89 | 90 | 91 | ### ====================================================================================================================== ### 92 | ### ============================================ Value Iteration ========================================================= ### 93 | ### ====================================================================================================================== ### 94 | 95 | algo_VI = ValueIteration() 96 | 97 | ### Plot the state values estimated through training 98 | src.policies_and_actions = algo_VI.find_optimal_policy_yielding(transition_probability=transition_probability_ocean, 99 | reward_probability=reward_probability_ocean, 100 | gamma=.98, 101 | n_iterations=15, 102 | sweep_order="random", 103 | initial_action_values="random", 104 | typical_value=-1, 105 | yield_frequency="step", 106 | ) 107 | 108 | 109 | results = [e.copy() if type(e) == np.ndarray else e for e in src.policies_and_actions] 110 | 111 | bact = 4 112 | fig, ax = plt.subplots() 113 | ax.set_xlim(-1, 11) 114 | ax.set_ylim(-20, bact + 2) 115 | ax.set_xlabel("s") 116 | title_control = f"DP Control (PI or VI) - Iteration 0" 117 | title_prediction = f"DP Prediction of Q (IPE) - Iteration 0" 118 | 119 | # actions, = ax.plot(S, results[1] + bact, ".b", label = "Actions") 120 | actions_join, =ax.plot(S[results[0] == 0], [bact] * (len(S)-np.sum(results[0])), "r") 122 | qvalues_closer, = ax.plot(S, results[1][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 123 | qvalues_far, = ax.plot(S, results[1][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 124 | ax.legend() 125 | 126 | def update(n): 127 | global title_control, title_prediction 128 | if n>= len(results): 129 | ax.set_title("Value Iteration (ended)") 130 | return 131 | data = results[n] 132 | if type(data) == str: 133 | if "Control" in data: 134 | title_control = data 135 | ax.set_title(title_control + " | " + title_prediction) 136 | elif "Prediction" in data: 137 | title_prediction = data 138 | ax.set_title(title_control + " | " + title_prediction) 139 | elif type(data) == np.ndarray: 140 | if len(data.shape) == 1: 141 | actions_join.set_data(S[data == 0], [bact] * (len(S)-np.sum(data))) 142 | actions_leave.set_data(S[data == 1], [bact] * np.sum(data)) 143 | elif len(data.shape) == 2: 144 | qvalues_closer.set_ydata(data[:, 0]) 145 | qvalues_far.set_ydata(data[:, 1]) 146 | 147 | anim = FuncAnimation( fig = fig, 148 | func = update, 149 | repeat = True, 150 | frames = np.arange(2, len(results)), 151 | interval = 100) 152 | 153 | anim.save("figure/DP/value_iteration.gif", writer = "ffmpeg", fps = 30) 154 | plt.show() 155 | 
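# Both animations above are driven by the Bellman optimality backup. A minimal sketch of one
# Value Iteration sweep over the action values, assuming P has shape (n_states, n_actions,
# n_states) and R has shape (n_states, n_actions) -- the actual layout of
# transition_probability_ocean / reward_probability_ocean may differ:
#
#   import numpy as np
#   def value_iteration_sweep(P, R, Q, gamma=0.98):
#       V = Q.max(axis=1)                            # greedy state values max_a' Q(s', a')
#       return R + gamma * np.einsum("sat,t->sa", P, V)
#
# Policy Iteration alternates a full policy evaluation (IPE) with the same greedy improvement
# step, which is why the plotting code above tracks separate "Prediction" and "Control" titles.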
-------------------------------------------------------------------------------- /DP/plot_prediction_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import transition_probability_ocean, reward_probability_ocean 7 | from DP.dynamicProgramming import IterativePolicyEvaluation 8 | from src.policies import DiscretePolicyForDiscreteState 9 | 10 | 11 | algo_IPE = IterativePolicyEvaluation() 12 | 13 | n_iterations = 15 14 | S = np.arange(0,11) 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | ### ====================================================================================================================== ### 24 | policy_join_beach = DiscretePolicyForDiscreteState(probs = np.array([[1, 0] for _ in range(11)])) 25 | ### ====================================================================================================================== ### 26 | 27 | 28 | ### Plot the state values estimated through training 29 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_join_beach, 30 | transition_probability = transition_probability_ocean, 31 | reward_probability = reward_probability_ocean, 32 | n_iterations = n_iterations, 33 | maximal_error = 0.01, 34 | gamma=0.98, 35 | sweep_order="random",) 36 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 37 | 38 | 39 | fig, ax = plt.subplots() 40 | ax.set_xlim(-1, 11) 41 | ax.set_ylim(-n_iterations-2, 1) 42 | ax.set_xlabel("s") 43 | ax.set_ylabel("V(s)") 44 | ax.set_title(f"Policy join_beach : Iteration 0") 45 | 46 | 47 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 48 | line, = ax.plot(S, -S, "-r", label="True State Values (-s)") 49 | ax.legend() 50 | 51 | def update(n): 52 | data = VS[n] 53 | if type(data) == str: 54 | ax.set_title(f"Policy join_beach : {data}") 55 | elif type(data) == np.ndarray: 56 | points.set_ydata(VS[n]) 57 | 58 | anim = FuncAnimation( fig = fig, 59 | func = update, 60 | repeat = True, 61 | frames = np.arange(0, len(VS)), 62 | interval = 100) 63 | 64 | anim.save("figure/DP/v_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = 2) 65 | plt.show() 66 | 67 | 68 | 69 | 70 | 71 | ### Plot the action values estimated through training 72 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_join_beach, 73 | transition_probability = transition_probability_ocean, 74 | reward_probability = reward_probability_ocean, 75 | n_iterations = n_iterations, 76 | maximal_error = 0.01, 77 | gamma = 0.98) 78 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 79 | 80 | 81 | fig, ax = plt.subplots() 82 | ax.set_xlim(-1, 11) 83 | ax.set_ylim(-n_iterations-2, 1) 84 | ax.set_xlabel("s") 85 | ax.set_ylabel("Q(s, a)") 86 | ax.set_title(f"Policy join_beach : Iteration 0") 87 | 88 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 89 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 90 | ax.legend() 91 | 92 | def update(n): 93 | data = QSA[n] 94 | if type(data) == str: 95 | ax.set_title(f"Policy join_beach : {data}") 96 | elif type(data) == np.ndarray: 97 | points_get_closer.set_ydata(QSA[n][:, 0]) 98 | 
points_get_far.set_ydata(QSA[n][:, 1]) 99 | 100 | anim = FuncAnimation( fig = fig, 101 | func = update, 102 | repeat = True, 103 | frames = np.arange(0, len(QSA)), 104 | interval = 100) 105 | 106 | anim.save("figure/DP/q_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = 2) 107 | plt.show() 108 | 109 | 110 | 111 | 112 | ### ====================================================================================================================== ### 113 | policy_leave_beach = DiscretePolicyForDiscreteState(probs = np.array([[0, 1] for _ in range(11)])) 114 | ### ====================================================================================================================== ### 115 | 116 | ### Plot the state values estimated through training 117 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_leave_beach, 118 | transition_probability = transition_probability_ocean, 119 | reward_probability = reward_probability_ocean, 120 | n_iterations = n_iterations, 121 | maximal_error = 0.01, 122 | gamma=0.8) 123 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 124 | 125 | 126 | fig, ax = plt.subplots() 127 | ax.set_xlim(-1, 11) 128 | ax.set_ylim(-n_iterations-2, 1) 129 | ax.set_xlabel("s") 130 | ax.set_ylabel("V(s)") 131 | ax.set_title(f"Policy leave_beach : Iteration 0") 132 | 133 | 134 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 135 | ax.legend() 136 | 137 | def update(n): 138 | data = VS[n] 139 | if type(data) == str: 140 | ax.set_title(f"Policy leave_beach : {data}") 141 | elif type(data) == np.ndarray: 142 | points.set_ydata(VS[n]) 143 | 144 | anim = FuncAnimation( fig = fig, 145 | func = update, 146 | repeat = True, 147 | frames = np.arange(0, len(VS)), 148 | interval = 100) 149 | 150 | anim.save("figure/DP/v_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = 2) 151 | plt.show() 152 | 153 | 154 | 155 | 156 | 157 | 158 | ### Plot the action values estimated through training 159 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_leave_beach, 160 | transition_probability = transition_probability_ocean, 161 | reward_probability = reward_probability_ocean, 162 | n_iterations = n_iterations, 163 | maximal_error = 0.01, 164 | gamma = 0.8) 165 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 166 | 167 | fig, ax = plt.subplots() 168 | ax.set_xlim(-1, 11) 169 | ax.set_ylim(-n_iterations-2, 1) 170 | ax.set_xlabel("s") 171 | ax.set_ylabel("Q(s, a)") 172 | ax.set_title(f"Policy leave_beach : Iteration 0") 173 | 174 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 175 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 176 | ax.legend() 177 | 178 | def update(n): 179 | data = QSA[n] 180 | if type(data) == str: 181 | ax.set_title(f"Policy leave_beach : {data}") 182 | elif type(data) == np.ndarray: 183 | points_get_closer.set_ydata(QSA[n][:, 0]) 184 | points_get_far.set_ydata(QSA[n][:, 1]) 185 | 186 | anim = FuncAnimation( fig = fig, 187 | func = update, 188 | repeat = True, 189 | frames = np.arange(0, len(QSA)), 190 | interval = 100) 191 | 192 | anim.save("figure/DP/q_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = 2) 193 | plt.show() 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | ### 
====================================================================================================================== ### 202 | policy_swim_randomly = DiscretePolicyForDiscreteState(probs = np.array([[0.8, 0.2] for _ in range(11)])) 203 | ### ====================================================================================================================== ### 204 | 205 | ### Plot the state values estimated through training 206 | estimated_state_values_during_training = algo_IPE.find_state_values_yielding( policy = policy_swim_randomly, 207 | transition_probability = transition_probability_ocean, 208 | reward_probability = reward_probability_ocean, 209 | n_iterations = n_iterations, 210 | maximal_error = 0.01, 211 | gamma=0.98) 212 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 213 | 214 | 215 | fig, ax = plt.subplots() 216 | ax.set_xlim(-1, 11) 217 | ax.set_ylim(-n_iterations-2, 1) 218 | ax.set_xlabel("s") 219 | ax.set_ylabel("V(s)") 220 | ax.set_title(f"Policy swim_randomly : Iteration 0") 221 | 222 | 223 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 224 | ax.legend() 225 | 226 | def update(n): 227 | data = VS[n] 228 | if type(data) == str: 229 | ax.set_title(f"Policy swim_randomly : {data}") 230 | elif type(data) == np.ndarray: 231 | points.set_ydata(VS[n]) 232 | 233 | anim = FuncAnimation( fig = fig, 234 | func = update, 235 | repeat = True, 236 | frames = np.arange(0, len(VS)), 237 | interval = 100) 238 | 239 | anim.save("figure/DP/v_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = 2) 240 | plt.show() 241 | 242 | 243 | 244 | 245 | 246 | 247 | ### Plot the action values estimated through training 248 | estimated_action_values_during_training = algo_IPE.find_action_values_yielding( policy = policy_swim_randomly, 249 | transition_probability = transition_probability_ocean, 250 | reward_probability = reward_probability_ocean, 251 | n_iterations = n_iterations, 252 | maximal_error = 0.01, 253 | gamma = 0.98) 254 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 255 | 256 | fig, ax = plt.subplots() 257 | ax.set_xlim(-1, 11) 258 | ax.set_ylim(-n_iterations-2, 1) 259 | ax.set_xlabel("s") 260 | ax.set_ylabel("Q(s, a)") 261 | ax.set_title(f"Policy swim_randomly : Iteration 0") 262 | 263 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 264 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 265 | ax.legend() 266 | 267 | def update(n): 268 | data = QSA[n] 269 | if type(data) == str: 270 | ax.set_title(f"Policy swim_randomly : {data}") 271 | elif type(data) == np.ndarray: 272 | points_get_closer.set_ydata(QSA[n][:, 0]) 273 | points_get_far.set_ydata(QSA[n][:, 1]) 274 | 275 | anim = FuncAnimation( fig = fig, 276 | func = update, 277 | repeat = True, 278 | frames = np.arange(0, len(QSA)), 279 | interval = 100) 280 | 281 | anim.save("figure/DP/q_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = 2) 282 | plt.show() -------------------------------------------------------------------------------- /MC/plot_prediction_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import OceanEnv 7 | from MC.monteCarlo import MonteCarlo 8 | 
from src.policies import DiscretePolicyForDiscreteState 9 | 10 | 11 | algo_MC = MonteCarlo() 12 | 13 | n_episodes = 10 14 | S = np.arange(0,11) 15 | y_low_lim = -20 16 | fps = 30 17 | 18 | 19 | 20 | 21 | 22 | 23 | ### ====================================================================================================================== ### 24 | policy_join_beach = DiscretePolicyForDiscreteState(probs = np.array([[1, 0] for _ in range(11)])) 25 | ### ====================================================================================================================== ### 26 | 27 | 28 | ### Plot the state values estimated through training 29 | estimated_state_values_during_training = algo_MC.find_state_values_yielding(policy = policy_join_beach, 30 | env = OceanEnv(), 31 | n_episodes = 3 * n_episodes, 32 | gamma=0.98, 33 | visit_method="first_visit", 34 | averaging_method="moving", 35 | alpha=0.1, 36 | timelimit=40, 37 | initial_state_values="random", 38 | typical_value = -5, 39 | exploring_starts=False, 40 | is_state_done=lambda state: state == 0, 41 | ) 42 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 43 | 44 | 45 | fig, ax = plt.subplots() 46 | ax.set_xlim(-1, 11) 47 | ax.set_ylim(-13, 1) 48 | ax.set_xlabel("s") 49 | ax.set_ylabel("V(s)") 50 | 51 | 52 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 53 | line, = ax.plot(S, -S, "-r", label="True State Values (-s)") 54 | ax.legend() 55 | 56 | def update(n): 57 | data = VS[n] 58 | if type(data) == str: 59 | ax.set_title(f"Policy join_beach : {data}") 60 | elif type(data) == np.ndarray: 61 | points.set_ydata(VS[n]) 62 | 63 | anim = FuncAnimation( fig = fig, 64 | func = update, 65 | repeat = True, 66 | frames = np.arange(0, len(VS)), 67 | interval = 30) 68 | 69 | anim.save("figure/MC/v_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = fps) 70 | plt.show() 71 | 72 | 73 | 74 | 75 | 76 | ### Plot the action values estimated through training 77 | estimated_action_values_during_training = algo_MC.find_action_values_yielding( policy = policy_join_beach, 78 | env = OceanEnv(), 79 | n_episodes = 3 * n_episodes, 80 | gamma=0.98, 81 | visit_method="first_visit", 82 | averaging_method="moving", 83 | alpha=0.1, 84 | timelimit=40, 85 | initial_action_values="random", 86 | typical_value = -5, 87 | exploring_starts=False, 88 | is_state_done=lambda state: state == 0, 89 | ) 90 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 91 | 92 | 93 | fig, ax = plt.subplots() 94 | ax.set_xlim(-1, 11) 95 | ax.set_ylim(-13, 1) 96 | ax.set_xlabel("s") 97 | ax.set_ylabel("Q(s, a)") 98 | 99 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 100 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 101 | ax.legend() 102 | 103 | def update(n): 104 | data = QSA[n] 105 | if type(data) == str: 106 | ax.set_title(f"Policy join_beach : {data}") 107 | elif type(data) == np.ndarray: 108 | points_get_closer.set_ydata(QSA[n][:, 0]) 109 | points_get_far.set_ydata(QSA[n][:, 1]) 110 | 111 | 112 | anim = FuncAnimation( fig = fig, 113 | func = update, 114 | repeat = True, 115 | frames = np.arange(0, len(QSA)), 116 | interval = 30) 117 | 118 | anim.save("figure/MC/q_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = fps) 119 | plt.show() 120 | 121 | 122 | 123 | 124 | ### 
====================================================================================================================== ### 125 | policy_leave_beach = DiscretePolicyForDiscreteState(probs = np.array([[0, 1] for _ in range(11)])) 126 | ### ====================================================================================================================== ### 127 | 128 | ### Plot the state values estimated through training 129 | estimated_state_values_during_training = algo_MC.find_state_values_yielding(policy = policy_leave_beach, 130 | env = OceanEnv(), 131 | n_episodes = 2 * n_episodes, 132 | gamma=0.8, 133 | visit_method="first_visit", 134 | averaging_method="moving", 135 | alpha=0.1, 136 | timelimit=40, 137 | initial_state_values="random", 138 | typical_value = -5, 139 | exploring_starts=False, 140 | is_state_done=lambda state: state == 0, 141 | ) 142 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 143 | 144 | 145 | fig, ax = plt.subplots() 146 | ax.set_xlim(-1, 11) 147 | ax.set_ylim(-13, 1) 148 | ax.set_xlabel("s") 149 | ax.set_ylabel("V(s)") 150 | 151 | 152 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 153 | ax.legend() 154 | 155 | def update(n): 156 | data = VS[n] 157 | if type(data) == str: 158 | ax.set_title(f"Policy leave_beach : {data}") 159 | elif type(data) == np.ndarray: 160 | points.set_ydata(VS[n]) 161 | 162 | anim = FuncAnimation( fig = fig, 163 | func = update, 164 | repeat = True, 165 | frames = np.arange(0, len(VS)), 166 | interval = 30) 167 | 168 | anim.save("figure/MC/v_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = fps) 169 | plt.show() 170 | 171 | 172 | 173 | 174 | 175 | 176 | ### Plot the action values estimated through training 177 | estimated_action_values_during_training = algo_MC.find_action_values_yielding( policy = policy_leave_beach, 178 | env = OceanEnv(), 179 | n_episodes = 2 * n_episodes, 180 | gamma=0.8, 181 | visit_method="first_visit", 182 | averaging_method="moving", 183 | alpha=0.1, 184 | timelimit=40, 185 | initial_action_values="random", 186 | typical_value = -5, 187 | exploring_starts=False, 188 | is_state_done=lambda state: state == 0, 189 | ) 190 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 191 | 192 | 193 | fig, ax = plt.subplots() 194 | ax.set_xlim(-1, 11) 195 | ax.set_ylim(-13, 1) 196 | ax.set_xlabel("s") 197 | ax.set_ylabel("Q(s, a)") 198 | ax.set_title(f"Policy leave_beach : Iteration 0/{n_episodes}") 199 | 200 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 201 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 202 | ax.legend() 203 | 204 | def update(n): 205 | data = QSA[n] 206 | if type(data) == str: 207 | ax.set_title(f"Policy leave_beach : {data}") 208 | elif type(data) == np.ndarray: 209 | points_get_closer.set_ydata(QSA[n][:, 0]) 210 | points_get_far.set_ydata(QSA[n][:, 1]) 211 | 212 | 213 | anim = FuncAnimation( fig = fig, 214 | func = update, 215 | repeat = True, 216 | frames = np.arange(0, len(QSA)), 217 | interval = 100) 218 | 219 | anim.save("figure/MC/q_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = fps) 220 | plt.show() 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | ### ====================================================================================================================== ### 229 | policy_swim_randomly = DiscretePolicyForDiscreteState(probs = 
np.array([[0.8, 0.2] for _ in range(11)])) 230 | ### ====================================================================================================================== ### 231 | 232 | ### Plot the state values estimated through training 233 | estimated_state_values_during_training = algo_MC.find_state_values_yielding(policy = policy_swim_randomly, 234 | env = OceanEnv(), 235 | n_episodes = 3 * n_episodes, 236 | gamma=0.98, 237 | visit_method="first_visit", 238 | averaging_method="moving", 239 | alpha=0.1, 240 | timelimit=40, 241 | initial_state_values="random", 242 | typical_value = -5, 243 | exploring_starts=False, 244 | is_state_done=lambda state: state == 0, 245 | ) 246 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 247 | 248 | 249 | fig, ax = plt.subplots() 250 | ax.set_xlim(-1, 11) 251 | ax.set_ylim(y_low_lim, 1) 252 | ax.set_xlabel("s") 253 | ax.set_ylabel("V(s)") 254 | 255 | 256 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 257 | ax.legend() 258 | 259 | def update(n): 260 | data = VS[n] 261 | if type(data) == str: 262 | ax.set_title(f"Policy swim_randomly : {data}") 263 | elif type(data) == np.ndarray: 264 | points.set_ydata(VS[n]) 265 | 266 | anim = FuncAnimation( fig = fig, 267 | func = update, 268 | repeat = True, 269 | frames = np.arange(0, len(VS)), 270 | interval = 30) 271 | 272 | anim.save("figure/MC/v_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = fps) 273 | plt.show() 274 | 275 | 276 | 277 | 278 | 279 | 280 | ### Plot the action values estimated through training 281 | estimated_action_values_during_training = algo_MC.find_action_values_yielding( policy = policy_swim_randomly, 282 | env = OceanEnv(), 283 | n_episodes = 4 * n_episodes, 284 | gamma=0.98, 285 | visit_method="first_visit", 286 | averaging_method="moving", 287 | alpha=0.1, 288 | timelimit=40, 289 | initial_action_values="random", 290 | typical_value = -5, 291 | exploring_starts=False, 292 | is_state_done=lambda state: state == 0, 293 | ) 294 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 295 | 296 | 297 | fig, ax = plt.subplots() 298 | ax.set_xlim(-1, 11) 299 | ax.set_ylim(y_low_lim, 1) 300 | ax.set_xlabel("s") 301 | ax.set_ylabel("Q(s, a)") 302 | ax.set_title(f"Policy swim_randomly : Iteration 0/{n_episodes}") 303 | 304 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 305 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 306 | ax.legend() 307 | 308 | def update(n): 309 | data = QSA[n] 310 | if type(data) == str: 311 | ax.set_title(f"Policy swim_randomly : {data}") 312 | elif type(data) == np.ndarray: 313 | points_get_closer.set_ydata(QSA[n][:, 0]) 314 | points_get_far.set_ydata(QSA[n][:, 1]) 315 | 316 | anim = FuncAnimation( fig = fig, 317 | func = update, 318 | repeat = True, 319 | frames = np.arange(0, len(QSA)), 320 | interval = 100) 321 | 322 | anim.save("figure/MC/q_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = fps) 323 | plt.show() -------------------------------------------------------------------------------- /TD/plot_prediction_figures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.animation import FuncAnimation 4 | 5 | from src.utils import * 6 | from environnements.oceanEnv import OceanEnv 7 | from TD.TDLearning 
import TD, SARSA 8 | from src.policies import DiscretePolicyForDiscreteState 9 | 10 | 11 | algo_TD = TD() 12 | algo_SARSA = SARSA() 13 | 14 | n_episodes = 30 15 | S = np.arange(0,11) 16 | y_low_lim = -20 17 | fps = 30 18 | 19 | 20 | 21 | 22 | 23 | 24 | ### ====================================================================================================================== ### 25 | policy_join_beach = DiscretePolicyForDiscreteState(probs = np.array([[1, 0] for _ in range(11)])) 26 | ### ====================================================================================================================== ### 27 | 28 | 29 | ### Plot the state values estimated through training 30 | estimated_state_values_during_training = algo_TD.find_state_values_yielding(policy = policy_join_beach, 31 | env = OceanEnv(), 32 | n_episodes = n_episodes, 33 | n_steps = float("inf"), 34 | gamma=0.99, 35 | alpha=0.5, 36 | timelimit=40, 37 | initial_state_values="random", 38 | typical_value = -5, 39 | exploring_starts=False, 40 | is_state_done=lambda state: state == 0, 41 | 42 | yield_frequency="step", 43 | ) 44 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 45 | 46 | 47 | fig, ax = plt.subplots() 48 | ax.set_xlim(-1, 11) 49 | ax.set_ylim(-13, 1) 50 | ax.set_xlabel("s") 51 | ax.set_ylabel("V(s)") 52 | 53 | 54 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 55 | line, = ax.plot(S, -S, "-r", label="True State Values (-s)") 56 | ax.legend() 57 | 58 | def update(n): 59 | data = VS[n] 60 | if type(data) == str: 61 | ax.set_title(f"Policy join_beach : {data}") 62 | elif type(data) == np.ndarray: 63 | points.set_ydata(VS[n]) 64 | 65 | anim = FuncAnimation( fig = fig, 66 | func = update, 67 | repeat = True, 68 | frames = np.arange(0, len(VS)), 69 | interval = 30) 70 | 71 | anim.save("figure/TD/v_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = fps) 72 | plt.show() 73 | 74 | 75 | 76 | 77 | ### Plot the action values estimated through training 78 | estimated_action_values_during_training = algo_SARSA.find_action_values_yielding( policy = policy_join_beach, 79 | env = OceanEnv(), 80 | n_episodes = n_episodes, 81 | n_steps = float("inf"), 82 | gamma=0.99, 83 | alpha=0.5, 84 | timelimit=40, 85 | initial_action_values="random", 86 | typical_value = -5, 87 | exploring_starts=False, 88 | is_state_done=lambda state: state == 0, 89 | 90 | yield_frequency="step", 91 | ) 92 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 93 | 94 | 95 | fig, ax = plt.subplots() 96 | ax.set_xlim(-1, 11) 97 | ax.set_ylim(-13, 1) 98 | ax.set_xlabel("s") 99 | ax.set_ylabel("Q(s, a)") 100 | 101 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 102 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 103 | ax.legend() 104 | 105 | def update(n): 106 | data = QSA[n] 107 | if type(data) == str: 108 | ax.set_title(f"Policy join_beach : {data}") 109 | elif type(data) == np.ndarray: 110 | points_get_closer.set_ydata(QSA[n][:, 0]) 111 | points_get_far.set_ydata(QSA[n][:, 1]) 112 | 113 | 114 | anim = FuncAnimation( fig = fig, 115 | func = update, 116 | repeat = True, 117 | frames = np.arange(0, len(QSA)), 118 | interval = 30) 119 | 120 | anim.save("figure/TD/q_values_joinBeach_estimated.gif", writer = "ffmpeg", fps = fps) 121 | plt.show() 122 | 123 | 124 | 125 | 126 | ### 
====================================================================================================================== ### 127 | policy_leave_beach = DiscretePolicyForDiscreteState(probs = np.array([[0, 1] for _ in range(11)])) 128 | ### ====================================================================================================================== ### 129 | 130 | ### Plot the state values estimated through training 131 | estimated_state_values_during_training = algo_TD.find_state_values_yielding(policy = policy_leave_beach, 132 | env = OceanEnv(), 133 | n_episodes = 5, 134 | n_steps = float("inf"), 135 | gamma=0.8, 136 | alpha=0.5, 137 | timelimit=40, 138 | initial_state_values="random", 139 | typical_value = -5, 140 | exploring_starts=False, 141 | is_state_done=lambda state: state == 0, 142 | 143 | yield_frequency="step", 144 | ) 145 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 146 | 147 | 148 | fig, ax = plt.subplots() 149 | ax.set_xlim(-1, 11) 150 | ax.set_ylim(-13, 1) 151 | ax.set_xlabel("s") 152 | ax.set_ylabel("V(s)") 153 | 154 | 155 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 156 | ax.legend() 157 | 158 | def update(n): 159 | data = VS[n] 160 | if type(data) == str: 161 | ax.set_title(f"Policy leave_beach : {data}") 162 | elif type(data) == np.ndarray: 163 | points.set_ydata(VS[n]) 164 | 165 | anim = FuncAnimation( fig = fig, 166 | func = update, 167 | repeat = True, 168 | frames = np.arange(0, len(VS)), 169 | interval = 30) 170 | 171 | anim.save("figure/TD/v_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = fps) 172 | plt.show() 173 | 174 | 175 | 176 | 177 | 178 | 179 | ### Plot the action values estimated through training 180 | estimated_action_values_during_training = algo_SARSA.find_action_values_yielding( policy = policy_leave_beach, 181 | env = OceanEnv(), 182 | n_episodes = 5, 183 | n_steps = float("inf"), 184 | gamma=0.8, 185 | alpha=0.5, 186 | timelimit=40, 187 | initial_action_values="random", 188 | typical_value = -5, 189 | exploring_starts=False, 190 | is_state_done=lambda state: state == 0, 191 | 192 | yield_frequency="step", 193 | ) 194 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 195 | 196 | 197 | fig, ax = plt.subplots() 198 | ax.set_xlim(-1, 11) 199 | ax.set_ylim(-13, 1) 200 | ax.set_xlabel("s") 201 | ax.set_ylabel("Q(s, a)") 202 | ax.set_title(f"Policy leave_beach : Iteration 0/{n_episodes}") 203 | 204 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 205 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 206 | ax.legend() 207 | 208 | def update(n): 209 | data = QSA[n] 210 | if type(data) == str: 211 | ax.set_title(f"Policy leave_beach : {data}") 212 | elif type(data) == np.ndarray: 213 | points_get_closer.set_ydata(QSA[n][:, 0]) 214 | points_get_far.set_ydata(QSA[n][:, 1]) 215 | 216 | 217 | anim = FuncAnimation( fig = fig, 218 | func = update, 219 | repeat = True, 220 | frames = np.arange(0, len(QSA)), 221 | interval = 100) 222 | 223 | anim.save("figure/TD/q_values_leaveBeach_estimated.gif", writer = "ffmpeg", fps = fps) 224 | plt.show() 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | ### ====================================================================================================================== ### 233 | policy_swim_randomly = DiscretePolicyForDiscreteState(probs = np.array([[0.8, 0.2] for _ 
in range(11)])) 234 | ### ====================================================================================================================== ### 235 | 236 | ### Plot the state values estimated through training 237 | estimated_state_values_during_training = algo_TD.find_state_values_yielding(policy = policy_swim_randomly, 238 | env = OceanEnv(), 239 | n_episodes = n_episodes, 240 | n_steps = float("inf"), 241 | gamma=0.99, 242 | alpha=0.5, 243 | timelimit=40, 244 | initial_state_values="random", 245 | typical_value = -5, 246 | exploring_starts=False, 247 | is_state_done=lambda state: state == 0, 248 | 249 | yield_frequency="step", 250 | ) 251 | VS = [e.copy() if type(e) == np.ndarray else e for e in estimated_state_values_during_training] 252 | 253 | 254 | fig, ax = plt.subplots() 255 | ax.set_xlim(-1, 11) 256 | ax.set_ylim(y_low_lim, 1) 257 | ax.set_xlabel("s") 258 | ax.set_ylabel("V(s)") 259 | 260 | 261 | points, = ax.plot(S, VS[0], ".b", label = "Estimated State Values") 262 | ax.legend() 263 | 264 | def update(n): 265 | data = VS[n] 266 | if type(data) == str: 267 | ax.set_title(f"Policy swim_randomly : {data}") 268 | elif type(data) == np.ndarray: 269 | points.set_ydata(VS[n]) 270 | 271 | anim = FuncAnimation( fig = fig, 272 | func = update, 273 | repeat = True, 274 | frames = np.arange(0, len(VS)), 275 | interval = 30) 276 | 277 | anim.save("figure/TD/v_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = fps) 278 | plt.show() 279 | 280 | 281 | 282 | 283 | 284 | 285 | ### Plot the action values estimated through training 286 | estimated_action_values_during_training = algo_SARSA.find_action_values_yielding( policy = policy_swim_randomly, 287 | env = OceanEnv(), 288 | n_episodes = n_episodes, 289 | n_steps = float("inf"), 290 | gamma=0.99, 291 | alpha=0.5, 292 | timelimit=40, 293 | initial_action_values="random", 294 | typical_value = -5, 295 | exploring_starts=False, 296 | is_state_done=lambda state: state == 0, 297 | 298 | yield_frequency="step", 299 | ) 300 | QSA = [e.copy() if type(e) == np.ndarray else e for e in estimated_action_values_during_training] 301 | 302 | 303 | fig, ax = plt.subplots() 304 | ax.set_xlim(-1, 11) 305 | ax.set_ylim(y_low_lim, 1) 306 | ax.set_xlabel("s") 307 | ax.set_ylabel("Q(s, a)") 308 | ax.set_title(f"Policy swim_randomly : Iteration 0/{n_episodes}") 309 | 310 | points_get_closer, = ax.plot(S, QSA[0][:, 0], ".g", label = "Estimated Q(s,a) for a = get_closer_to_beach") 311 | points_get_far, = ax.plot(S, QSA[0][:, 1], "xr", label = "Estimated Q(s,a) for a = get_far_from_beach") 312 | ax.legend() 313 | 314 | def update(n): 315 | data = QSA[n] 316 | if type(data) == str: 317 | ax.set_title(f"Policy swim_randomly : {data}") 318 | elif type(data) == np.ndarray: 319 | points_get_closer.set_ydata(QSA[n][:, 0]) 320 | points_get_far.set_ydata(QSA[n][:, 1]) 321 | 322 | anim = FuncAnimation( fig = fig, 323 | func = update, 324 | repeat = True, 325 | frames = np.arange(0, len(QSA)), 326 | interval = 100) 327 | 328 | anim.save("figure/TD/q_values_swim_randomly_estimated.gif", writer = "ffmpeg", fps = fps) 329 | plt.show() -------------------------------------------------------------------------------- /TD/TDLearning.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Iterator, Tuple, Union 2 | import numpy as np 3 | import gym 4 | 5 | from src.policies import * 6 | from src.utils import * 7 | 8 | class TD: 9 | 10 | def find_state_values(self, policy : DiscretePolicyForDiscreteState, 11 | 
env : gym.Env, 12 | n_episodes : int = float("inf"), 13 | n_steps : int = float("inf"), 14 | gamma : float = 0.99, 15 | alpha : float = 0.1, 16 | timelimit : int = float("inf"), 17 | initial_state_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 18 | typical_value : float = 1, 19 | exploring_starts : bool = False, 20 | is_state_done : Callable = None, 21 | verbose : int = 1, 22 | ) -> np.ndarray: 23 | """This method performs TD(0) for state values, an online on-policy TD Learning algorithm aiming to estimates the state value. 24 | The algorithm stop after a certain number of episodes or steps done. 25 | 26 | policy : the policy to evaluate 27 | env : the environment to evaluate the policy on 28 | n_episodes : the maximal number of episodes of interaction with the env to perform the algorithm 29 | n_steps : the maximal number of steps of interaction with the env to perform the algorithm 30 | gamma : the discount factor 31 | alpha : the learning rate 32 | timelimit : the number of maximal steps in an episode. After that the episode will be considered done. Use for non terminal env. 33 | initial_state_values : the initial values of the state values. Can be "random", "zeros", "optimistic" or a numpy array. 34 | typical_value : the typical value of the state values. Used to initialize the state values if initial_state_values is "random". 35 | exploring_starts : if True, the algorithm will start at a random-non terminal state. Use IF accessible env. Use for create minimum exploration in the case of deterministic src.policies. 36 | is_state_done : a function returning whether a state is terminal. Used if exploring_starts is True for no initialization in the terminal states 37 | verbose : the verbosity level. 0 for no output, 1 for output. 38 | """ 39 | 40 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 41 | 42 | if verbose >= 1 : 43 | print(pretty_announcer(f"Start algorithm TD(0) for V.\nExploring starts : {exploring_starts}\nFor {n_episodes} episodes or {n_steps} steps.")) 44 | 45 | # Initialize the state values 46 | state_values = initialize_values( shape = (policy.n_states,), 47 | initial_values = initial_state_values, 48 | typical_value = typical_value) 49 | num_episode = 0 50 | num_total_step = 0 51 | 52 | while num_episode < n_episodes and num_total_step < n_steps: 53 | if verbose >= 1 : print(f"TD(0) Prediction of V - Episode {num_episode}/{n_episodes} - Step {num_total_step}/{n_steps}") 54 | # Initialize the state 55 | if exploring_starts: 56 | state_temp = env.reset() 57 | if not is_state_done(state_temp): 58 | state = state_temp 59 | env.state = state 60 | else: 61 | state = env.reset() 62 | else: 63 | state = env.reset() 64 | 65 | # Loop through the episode 66 | t = 0 67 | done = False 68 | while not done and t < timelimit and num_total_step < n_steps: 69 | # Take action, observe the next state and reward 70 | action = np.random.choice(policy.n_actions, p=policy.probs[state]) 71 | next_state, reward, done, _ = env.step(action) 72 | # Update the state values online 73 | state_values[state] += alpha * (reward + gamma * state_values[next_state] * (1-done) - state_values[state]) 74 | # timelimit : we artificially set the episode as done if the timelimit is reached 75 | if t >= timelimit: done = True 76 | # If done, we additonally learn V(s_next) to be 0. 
77 | if done: 78 | state_values[next_state] += alpha * (0 - state_values[next_state]) 79 | 80 | state = next_state 81 | t += 1 82 | num_total_step += 1 83 | 84 | num_episode += 1 85 | 86 | if verbose >= 1: 87 | print(f"TD(0) Prediction of V finished after {num_episode} episodes and {num_total_step} steps. State values found : {state_values}") 88 | 89 | return state_values 90 | 91 | 92 | 93 | def find_state_values_yielding(self,policy : DiscretePolicyForDiscreteState, 94 | env : gym.Env, 95 | n_episodes : int = float("inf"), 96 | n_steps : int = float("inf"), 97 | gamma : float = 0.99, 98 | alpha : float = 0.1, 99 | timelimit : int = float("inf"), 100 | initial_state_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 101 | typical_value : float = 1, 102 | exploring_starts : bool = False, 103 | is_state_done : Callable = None, 104 | yield_frequency : str = "step", # "iteration", "episode", "step" 105 | **kwargs, 106 | ) -> Iterator: 107 | """ 108 | Same as find_state_values, but yields the state values at each step. 109 | 110 | yield_frequency : "step" or "episode" or "iteration", the frequency at which the state values are yielded. 111 | """ 112 | 113 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 114 | assert yield_frequency in ["step", "episode", "iteration"], "yield_frequency must be 'step', 'episode' or 'iteration'" 115 | 116 | # Initialize the state values 117 | state_values = initialize_values( shape = (policy.n_states,), 118 | initial_values = initial_state_values, 119 | typical_value = typical_value) 120 | if yield_frequency != "iterations" : yield state_values 121 | num_episode = 0 122 | num_total_step = 0 123 | 124 | while num_episode < n_episodes and num_total_step < n_steps: 125 | 126 | if exploring_starts: 127 | state_temp = env.reset() 128 | if not is_state_done(state_temp): 129 | state = state_temp 130 | env.state = state 131 | else: 132 | state = env.reset() 133 | else: 134 | state = env.reset() 135 | 136 | # Loop through the episode 137 | t = 0 138 | done = False 139 | while not done and t < timelimit and num_total_step < n_steps: 140 | yield f"TD(0) Prediction of V - Episode {num_episode}/{n_episodes} - Step {num_total_step}/{n_steps}" 141 | # Take action, observe the next state and reward 142 | action = np.random.choice(policy.n_actions, p=policy.probs[state]) 143 | next_state, reward, done, _ = env.step(action) 144 | # Update the state values online 145 | state_values[state] += alpha * (reward + gamma * state_values[next_state] * (1-done) - state_values[state]) 146 | if yield_frequency == "step": yield state_values 147 | # timelimit : we artificially set the episode as done if the timelimit is reached 148 | if t >= timelimit: done = True 149 | # If done, we additonally learn V(s_next) to be 0. 
150 | if done: 151 | state_values[next_state] += alpha * (0 - state_values[next_state]) 152 | 153 | state = next_state 154 | t += 1 155 | num_total_step += 1 156 | 157 | if yield_frequency == "episode": yield state_values 158 | num_episode += 1 159 | if yield_frequency == "iteration": yield state_values 160 | 161 | 162 | class SARSA: 163 | 164 | def find_action_values(self,policy : DiscretePolicyForDiscreteState, 165 | env : gym.Env, 166 | n_episodes : int = float("inf"), 167 | n_steps : int = float("inf"), 168 | gamma : float = 0.99, 169 | alpha : float = 0.1, 170 | timelimit : int = float("inf"), 171 | initial_action_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 172 | typical_value : float = 1, 173 | exploring_starts : bool = False, 174 | is_state_done : Callable = None, 175 | verbose : int = 1, 176 | ) -> np.ndarray: 177 | """This method performs SARSA for action values, an online on-policy TD Learning algorithm aiming to estimates the action value. 178 | The algorithm stop after a certain number of episodes or steps done. 179 | 180 | policy : the policy to evaluate 181 | env : the environment to evaluate the policy on 182 | n_episodes : the maximal number of episodes of interaction with the env to perform the algorithm 183 | n_steps : the maximal number of steps of interaction with the env to perform the algorithm 184 | gamma : the discount factor 185 | alpha : the learning rate 186 | timelimit : the number of maximal steps in an episode. After that the episode will be considered done. Use for non terminal env. 187 | initial_action_values : the initial values of the action values. Can be "random", "zeros", "optimistic" or a numpy array. 188 | typical_value : the typical value of the action values. Used to initialize the action values if initial_action_values is "random". 189 | exploring_starts : if True, the algorithm will start at a random-non terminal qstate. Use IF accessible env. Use for create minimum exploration in the case of deterministic src.policies. 190 | is_state_done : a function returning whether a state is terminal. Used if exploring_starts is True for no initialization in the terminal states 191 | verbose : the verbosity level. 0 for no output, 1 for output. 192 | """ 193 | 194 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 195 | assert not exploring_starts or is_state_done is not None, "is_state_done must be specified if exploring_starts is True." 196 | 197 | if verbose >= 1 : 198 | print(pretty_announcer(f"Start algorithm SARSA for Q.\nExploring starts : {exploring_starts}\nFor {n_episodes} episodes or {n_steps} steps.")) 199 | 200 | # Initialize the state values 201 | action_values = initialize_values( shape = (policy.n_states, policy.n_actions), 202 | initial_values = initial_action_values, 203 | typical_value = typical_value) 204 | num_episode = 0 205 | num_total_step = 0 206 | state = env.reset() 207 | 208 | while num_episode < n_episodes and num_total_step < n_steps: 209 | if verbose >= 1 : print(f"SARSA Prediction of Q - Episode {num_episode}/{n_episodes} - Step {num_total_step}/{n_steps}") 210 | # Initialize the qstate 211 | state = env.reset() 212 | action = np.random.choice(policy.n_actions, p=policy.probs[state]) 213 | 214 | if exploring_starts: # If exploring starts, we try to choose randomly a qstate (s,a) with s non terminal. This unsure minimum exploration. 
215 | state_temp = np.random.choice(policy.n_states) 216 | if not is_state_done(state_temp): 217 | state = state_temp 218 | env.state = state 219 | action = np.random.choice(policy.n_actions) 220 | 221 | # Loop through the episode 222 | t = 0 223 | done = False 224 | while not done and t < timelimit and num_total_step < n_steps: 225 | # Take action, observe the next state and reward, take next action 226 | next_state, reward, done, _ = env.step(action) 227 | next_action = np.random.choice(policy.n_actions, p=policy.probs[next_state]) 228 | # Update the action values online 229 | action_values[state][action] += alpha * (reward + gamma * action_values[next_state][next_action] * (1-done) - action_values[state][action]) 230 | # timelimit : we artificially set the episode as done if the timelimit is reached 231 | if t >= timelimit: done = True 232 | # If done, we additonally learn Q(s_next, a_next) to be 0. 233 | if done: 234 | action_values[next_state][next_action] += alpha * (0 - action_values[next_state][next_action]) 235 | 236 | # Update the qstate 237 | state = next_state 238 | action = next_action 239 | t += 1 240 | num_total_step += 1 241 | 242 | num_episode += 1 243 | 244 | if verbose >= 1: 245 | print(f"SARSA Prediction of Q finished after {num_episode} episodes and {num_total_step} steps. Action values found : {action_values}") 246 | 247 | return action_values 248 | 249 | 250 | 251 | def find_action_values_yielding(self, policy : DiscretePolicyForDiscreteState, 252 | env : gym.Env, 253 | n_episodes : int = float("inf"), 254 | n_steps : int = float("inf"), 255 | gamma : float = 0.99, 256 | alpha : float = 0.1, 257 | timelimit : int = float("inf"), 258 | initial_action_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 259 | typical_value : float = 1, 260 | exploring_starts : bool = False, 261 | is_state_done : Callable = None, 262 | yield_frequency : str = "step", 263 | **kwargs, 264 | ) -> Iterator: 265 | """ 266 | Same as find_action_values, but yields the action values at each step. 267 | 268 | yield_frequency : "step" or "episode" or "iteration", the frequency at which the action values are yielded. 269 | """ 270 | 271 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 272 | assert not exploring_starts or is_state_done is not None, "is_state_done must be specified if exploring_starts is True." 
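        # Minimal usage sketch for the non-yielding method above (illustrative only;
        # `policy` and `env` are placeholders for a DiscretePolicyForDiscreteState and
        # a discrete gym environment, e.g. one of those defined under environnements/):
        #     sarsa = SARSA()
        #     Q = sarsa.find_action_values(policy, env, n_episodes=1000, gamma=0.98,
        #                                  alpha=0.1, timelimit=200, verbose=0)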
273 | 
274 |         # Initialize the action values
275 |         action_values = initialize_values( shape = (policy.n_states, policy.n_actions),
276 |                                            initial_values = initial_action_values,
277 |                                            typical_value = typical_value)
278 |         if yield_frequency != "iteration": yield action_values
279 |         num_episode = 0
280 |         num_total_step = 0
281 |         state = env.reset()
282 | 
283 |         while num_episode < n_episodes and num_total_step < n_steps:
284 |             # Initialize the qstate
285 |             state = env.reset()
286 |             action = np.random.choice(policy.n_actions, p=policy.probs[state])
287 |             # Loop through the episode
288 |             t = 0
289 |             done = False
290 |             while not done and t < timelimit and num_total_step < n_steps:
291 |                 yield f"SARSA Prediction of Q - Episode {num_episode}/{n_episodes} - Step {num_total_step}/{n_steps}"
292 |                 # Take action, observe the next state and reward, then sample the next action from the policy
293 |                 next_state, reward, done, _ = env.step(action)
294 |                 next_action = np.random.choice(policy.n_actions, p=policy.probs[next_state])
298 |                 # Update the action values online
299 |                 action_values[state][action] += alpha * (reward + gamma * action_values[next_state][next_action] * (1-done) - action_values[state][action])
300 |                 if yield_frequency == "step": yield action_values
301 |                 # timelimit : we artificially set the episode as done if the timelimit is reached
302 |                 if t >= timelimit: done = True
303 |                 # If done, we additionally learn Q(s_next, a_next) to be 0.
304 |                 if done:
305 |                     action_values[next_state][next_action] += alpha * (0 - action_values[next_state][next_action])
306 | 
307 |                 state = next_state
308 |                 action = next_action
309 |                 t += 1
310 |                 num_total_step += 1
311 | 
312 |             if yield_frequency == "episode": yield action_values
313 |             num_episode += 1
314 |         if yield_frequency == "iteration": yield action_values
315 | 
316 | 
317 |     def find_optimal_policy(self, env : gym.Env,
318 |                                 gamma : float = 1,
319 |                                 n_episodes : int = float("inf"),
320 |                                 n_steps : int = float("inf"),
321 |                                 exploration_method : str = "epsilon_greedy", # "epsilon_greedy", "UCB", "exploring_starts" or "greedy"
322 |                                 epsilon : Union[float, Scheduler] = 0.1,
323 |                                 alpha : float = 0.1,
324 |                                 timelimit : int = float("inf"),
325 |                                 initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array
326 |                                 typical_value : float = 1,
327 |                                 return_action_values : bool = False,
328 |                                 is_state_done : Callable = None,
329 |                                 verbose : int = 1,
330 |                                 ) -> Union[DiscretePolicyForDiscreteState, Tuple[DiscretePolicyForDiscreteState, np.ndarray]]:
331 |         """This method performs SARSA Control, an on-policy online Control algorithm.
332 |         It aims to find the optimal policy (among the explorative subset of policies induced by the exploration method).
333 | 
334 |         env : the environment to learn from
335 |         gamma : the discount factor
336 |         n_episodes : the number of episodes to learn from
337 |         exploration_method : the method to use for exploration ("epsilon_greedy", "UCB", "exploring_starts" or "greedy")
338 |         epsilon : the epsilon parameter for the epsilon-greedy method, can be a scalar or a Scheduler that returns a scalar given a timestep/episode
339 |         alpha : the learning rate used in the moving-average update
340 |         timelimit : the timelimit of an episode (useful for non-terminal environments)
341 |         initial_action_values : the initial values for the action values ("random", "zeros", "optimistic" or a numpy array)
342 |         typical_value : the typical value for the action values, used for scaling the "random" and "optimistic" value-initialization methods.
343 | return_action_values : if True, the method returns the action values along with the policy 344 | is_state_done : function return whether a state is terminal, used for the "exploring_starts" method 345 | verbose : the verbosity level 346 | """ 347 | 348 | if verbose >= 1 : 349 | print(pretty_announcer(f"Start algorithm SARSA Control.\nExploration method used : {exploration_method}\nFor {n_episodes} episodes or {n_steps} steps.")) 350 | 351 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 352 | assert exploration_method in ["epsilon_greedy", "UCB", "exploring_starts", "greedy"], "Unknown exploration method : {}".format(exploration_method) 353 | assert n_episodes > 0, "The number of episodes must be positive." 354 | 355 | # Initialize the action values 356 | n_states, n_actions = env.observation_space.n, env.action_space.n 357 | action_values = initialize_values( shape = (n_states, n_actions), 358 | initial_values = initial_action_values, 359 | typical_value = typical_value) 360 | 361 | # Loop through the episodes 362 | num_episode = 0 363 | num_total_step = 0 364 | state = env.reset() 365 | 366 | while num_episode < n_episodes: 367 | if verbose >= 1 : print(f"SARSA Control - Episode {num_episode}/{n_episodes}") 368 | # Initialize the qstate 369 | state = env.reset() 370 | if exploration_method == "greedy": 371 | action = np.argmax(action_values[state]) 372 | elif exploration_method == "epsilon_greedy": 373 | eps = epsilon if np.isscalar(epsilon) else epsilon(timestep=num_total_step, episode=num_episode) 374 | action = np.random.choice(n_actions) if np.random.random() < eps else np.argmax(action_values[state]) 375 | elif exploration_method == "UCB": 376 | raise NotImplementedError("UCB exploration method is not implemented yet.") 377 | elif exploration_method == "exploring_starts": 378 | assert is_state_done is not None, "is_state_done must be specified if exploring_starts is True." 
379 | state_temp = np.random.choice(n_states) 380 | if is_state_done(state_temp): 381 | action = np.argmax(action_values[state_temp]) 382 | else: 383 | action = np.random.choice(n_actions) 384 | state = state_temp 385 | env.state = state_temp 386 | else: 387 | raise NotImplementedError("Unknown exploration method : {}".format(exploration_method)) 388 | 389 | # Loop through the episode 390 | t=0 391 | done = False 392 | while not done and t < timelimit and num_total_step < n_steps: 393 | # Take action, observe the next state and reward, choose next action 394 | next_state, reward, done, _ = env.step(action) 395 | if exploration_method == "greedy" or exploration_method == "exploring_starts": 396 | next_action = np.argmax(action_values[next_state]) 397 | elif exploration_method == "epsilon_greedy": 398 | eps = epsilon if np.isscalar(epsilon) else epsilon(timestep=num_total_step, episode=num_episode) 399 | next_action = np.random.choice(n_actions) if np.random.random() < eps else np.argmax(action_values[next_state]) 400 | elif exploration_method == "UCB": 401 | raise NotImplementedError("UCB exploration method is not implemented yet.") 402 | else: 403 | raise NotImplementedError("Unknown exploration method : {}".format(exploration_method)) 404 | # Update the action values online 405 | action_values[state][action] += alpha * (reward + gamma * action_values[next_state][next_action] * (1-done) - action_values[state][action]) 406 | # timelimit : we artificially set the episode as done if the timelimit is reached 407 | if t >= timelimit: done = True 408 | # If done, we additonally learn Q(s_next, a_next) to be 0, since by conventon values of terminal states are 0 409 | if done: 410 | action_values[next_state][next_action] += alpha * (0 - action_values[next_state][next_action]) 411 | 412 | # Update the state and action 413 | state = next_state 414 | action = next_action 415 | t += 1 416 | num_total_step += 1 417 | 418 | num_episode += 1 419 | 420 | if verbose >= 1: 421 | print(f"SARSA Control finished after {num_episode} episodes and {num_total_step} steps. Action values found : {action_values}") 422 | 423 | probs = np.array([[int(action == np.argmax(action_values[state])) for action in range(n_actions)] for state in range(n_states)]) 424 | optimal_policy = DiscretePolicyForDiscreteState(probs) 425 | if return_action_values: 426 | return optimal_policy, action_values 427 | else: 428 | return optimal_policy 429 | 430 | 431 | 432 | def find_optimal_policy_yielding(self, env : gym.Env, 433 | gamma : float = 1, 434 | n_episodes : int = float("inf"), 435 | n_steps : int = float("inf"), 436 | exploration_method : str = "epsilon_greedy", # "epsilon_greedy" or "UCB" 437 | epsilon : Union[float, Scheduler] = 0.1, 438 | alpha : float = 0.1, 439 | timelimit : int = float("inf"), 440 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 441 | typical_value : float = 1, 442 | return_action_values : bool = False, 443 | is_state_done : Callable = None, 444 | yielding_frequency : str = "step", # "step" or "episode" 445 | **kwargs, 446 | ) -> Iterator: 447 | """Same as find_optimal_policy, but yields the action values along with the actions through the training 448 | 449 | yield_frequency : "step" or "episode", the frequency at which the state values are yielded. 450 | """ 451 | assert n_episodes != float("inf") or n_steps != float("inf"), "Either n_episodes or n_steps must be specified." 
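        # Minimal usage sketch for SARSA Control (the non-yielding find_optimal_policy
        # above); illustrative only, `env` stands for any discrete gym environment,
        # e.g. one of those defined under environnements/ :
        #     sarsa = SARSA()
        #     policy, q_values = sarsa.find_optimal_policy(env, gamma=0.98, n_episodes=500,
        #                                                  exploration_method="epsilon_greedy",
        #                                                  epsilon=0.1, alpha=0.1, timelimit=200,
        #                                                  return_action_values=True, verbose=0)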
452 | assert exploration_method in ["epsilon_greedy", "UCB", "exploring_starts", "greedy"], "Unknown exploration method : {}".format(exploration_method) 453 | assert n_episodes > 0, "The number of episodes must be positive." 454 | 455 | # Initialize the action values 456 | n_states, n_actions = env.observation_space.n, env.action_space.n 457 | action_values = initialize_values( shape = (n_states, n_actions), 458 | initial_values = initial_action_values, 459 | typical_value = typical_value) 460 | greedy_actions = np.argmax(action_values, axis=1) 461 | yield greedy_actions 462 | yield action_values 463 | 464 | # Loop through the episodes 465 | num_episode = 0 466 | num_total_step = 0 467 | state = env.reset() 468 | 469 | while num_episode < n_episodes: 470 | yield f"SARSA Control - Episode {num_episode}/{n_episodes} - Step {num_total_step}/{n_steps}" 471 | # Initialize the qstate 472 | state = env.reset() 473 | if exploration_method == "greedy": 474 | action = np.argmax(action_values[state]) 475 | elif exploration_method == "epsilon_greedy": 476 | eps = epsilon if np.isscalar(epsilon) else epsilon(timestep=num_total_step, episode=num_episode) 477 | action = np.random.choice(n_actions) if np.random.random() < eps else np.argmax(action_values[state]) 478 | elif exploration_method == "UCB": 479 | raise NotImplementedError("UCB exploration method is not implemented yet.") 480 | elif exploration_method == "exploring_starts": 481 | assert is_state_done is not None, "is_state_done must be specified if exploring_starts is True." 482 | state_temp = np.random.choice(n_states) 483 | if is_state_done(state_temp): 484 | action = np.argmax(action_values[state_temp]) 485 | else: 486 | action = np.random.choice(n_actions) 487 | state = state_temp 488 | env.state = state_temp 489 | else: 490 | raise NotImplementedError("Unknown exploration method : {}".format(exploration_method)) 491 | 492 | # Loop through the episode 493 | t=0 494 | done = False 495 | while not done and t < timelimit and num_total_step < n_steps: 496 | # Take action, observe the next state and reward, choose next action 497 | next_state, reward, done, _ = env.step(action) 498 | if exploration_method == "greedy" or exploration_method == "exploring_starts": 499 | next_action = np.argmax(action_values[next_state]) 500 | elif exploration_method == "epsilon_greedy": 501 | eps = epsilon if np.isscalar(epsilon) else epsilon(timestep=num_total_step, episode=num_episode) 502 | next_action = np.random.choice(n_actions) if np.random.random() < eps else np.argmax(action_values[next_state]) 503 | elif exploration_method == "UCB": 504 | raise NotImplementedError("UCB exploration method is not implemented yet.") 505 | else: 506 | raise NotImplementedError("Unknown exploration method : {}".format(exploration_method)) 507 | # Update the action values online 508 | action_values[state][action] += alpha * (reward + gamma * action_values[next_state][next_action] * (1-done) - action_values[state][action]) 509 | # timelimit : we artificially set the episode as done if the timelimit is reached 510 | if t >= timelimit: done = True 511 | # If done, we additonally learn Q(s_next, a_next) to be 0, since by conventon values of terminal states are 0 512 | if done: 513 | action_values[next_state][next_action] += alpha * (0 - action_values[next_state][next_action]) 514 | 515 | # Update the state and action 516 | state = next_state 517 | action = next_action 518 | t += 1 519 | num_total_step += 1 520 | 521 | if yielding_frequency == "step": 522 | greedy_actions = 
np.argmax(action_values, axis=1) 523 | yield action_values 524 | yield greedy_actions 525 | 526 | greedy_actions = np.argmax(action_values, axis=1) 527 | yield action_values 528 | yield greedy_actions 529 | num_episode += 1 530 | 531 | -------------------------------------------------------------------------------- /DP/dynamicProgramming.py: -------------------------------------------------------------------------------- 1 | from time import sleep, time 2 | from typing import Iterator, Tuple, Union 3 | import numpy as np 4 | 5 | from src.policies import * 6 | from src.utils import * 7 | 8 | class IterativePolicyEvaluation: 9 | 10 | def find_state_values(self, policy : DiscretePolicyForDiscreteState, 11 | transition_probability : np.ndarray, 12 | reward_probability : np.ndarray, 13 | n_iterations : int = None, 14 | maximal_error : float = None, 15 | gamma : float = 1, 16 | sweep_order : str = "normal", # "normal" or "reverse" or "random" 17 | initial_state_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array 18 | typical_value : float = 1, 19 | verbose = 1, 20 | **kwargs, 21 | ) -> np.ndarray: 22 | """This method perform the IterativePolicyEvaluation algorithm. It computes an estimation of the state values for a given policy, in a given model (transition_probability and reward_probability). 23 | The algorithm stop either after a given number of iterations or when the worst error (among the states) between two V(s) estimation consecutive is below a given threshold. 24 | 25 | transition_probability : a numpy array of shape (n_states, n_actions, n_states) representing the transition probability between states. 26 | reward_probability : a numpy array of shape (n_states, n_actions) representing the reward probability for each action in each state. 27 | n_iterations : the number of iterations to perform. 28 | maximal_error : the error between 2 consecutives state value below what the algorithm will stop, considering that it has converged. 29 | gamma : the discount factor. 30 | sweep_order : the order in which we will iterate over the states. "normal" or "reverse" or "random". This can have a significant impact on the convergence of the algorithm. 31 | initial_state_values : the initial values of the state values. Can be "random", "zeros", "optimistic" or a numpy array. 32 | typical_value : the typical value of the state values. Used to initialize the state values if initial_state_values is "random". 33 | verbose : the verbosity level, 0 for no output, 1 for an end output. 34 | """ 35 | 36 | assert n_iterations != None or maximal_error != None, "The stop condition is not well defined. Please specify either n_iterations or maximal_error." 37 | 38 | # Define the order in which we will iterate over the states 39 | n_states, n_actions = reward_probability.shape 40 | states_sweep = np.arange(n_states) 41 | if sweep_order == "reverse": 42 | states_sweep = np.flip(states_sweep) 43 | elif sweep_order == "random": 44 | np.random.shuffle(states_sweep) 45 | 46 | # Initialize the state values 47 | state_values = initialize_values( shape = (n_states,), 48 | initial_values=initial_state_values, 49 | typical_value=typical_value) 50 | n_iter = 0 51 | keep_iterating = True 52 | 53 | while keep_iterating: 54 | worst_error = 0 55 | # Iterate over the states, update state value in an in-place manner (using only one array). 
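            # The update below applies the Bellman expectation backup implemented in
            # compute_state_value :
            #     V(s) <- sum_a pi(a|s) * [ R(s, a) + gamma * sum_s' P(s'|s, a) * V(s') ]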
56 |             for state in states_sweep:
57 |                 value = state_values[state]
58 |                 state_values[state] = self.compute_state_value(state, policy, transition_probability, reward_probability, state_values, gamma)
59 |                 worst_error = max(worst_error, abs(value - state_values[state]))
60 |             # Stop algorithm if we reached the maximum number of iterations or if the error is below the threshold
61 |             n_iter += 1
62 |             if n_iterations != None and n_iter >= n_iterations:
63 |                 keep_iterating = False
64 |                 if verbose >= 1: print("The algorithm stopped after {} iterations. Stop condition : number of iterations reached.".format(n_iter))
65 |             elif maximal_error != None and worst_error <= maximal_error:
66 |                 keep_iterating = False
67 |                 if verbose >= 1: print("The algorithm stopped after {} iterations. Stop condition : worst error ({}) below the requested maximal error ({})".format(n_iter, worst_error, maximal_error))
68 | 
69 |         return state_values
70 | 
71 | 
72 | 
73 |     def compute_state_value(self, state : int,
74 |                                 policy : DiscretePolicyForDiscreteState,
75 |                                 transition_probability : np.ndarray,
76 |                                 reward_probability : np.ndarray,
77 |                                 state_values : np.ndarray,
78 |                                 gamma : float) -> float:
79 |         """This function computes the state value for a given state, policy, model (transition_probability and reward_probability) and current state values vector.
80 |         It applies the Bellman operator for state values, i.e. the right-hand side of the Bellman expectation equation for V.
81 |         """
82 |         n_states, n_actions = reward_probability.shape
83 |         value = 0
84 |         for action in range(n_actions):
85 |             value += policy.get_prob(state, action) * (reward_probability[state, action] +
86 |                                                        gamma * transition_probability[state, action, :].dot(state_values))
87 |         return value
88 | 
89 | 
90 | 
91 | 
92 |     def find_state_values_yielding(self, policy : DiscretePolicyForDiscreteState,
93 |                                         transition_probability : np.ndarray,
94 |                                         reward_probability : np.ndarray,
95 |                                         n_iterations : int = None,
96 |                                         maximal_error : float = None,
97 |                                         gamma : float = 1,
98 |                                         sweep_order : str = "normal", # "normal" or "reverse" or "random"
99 |                                         initial_state_values : Union[np.ndarray, str] = "random", # "random", "zeros", "optimistic" or a numpy array
100 |                                         typical_value : float = 1,
101 |                                         yield_frequency : str = "step", # "step", "iteration" or "global_iteration"
102 |                                         **kwargs,
103 |                                         ) -> Iterator:
104 |         """This function is the same as find_state_values, but it yields the state values during training. Useful for observing the convergence of the algorithm.
105 | 
106 |         yield_frequency : "step", "iteration" or "global_iteration", the frequency at which the state values are yielded.
107 |         """
108 | 
109 |         assert n_iterations != None or maximal_error != None, "The stop condition is not well defined. Please specify either n_iterations or maximal_error."
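        # Minimal usage sketch for the prediction methods of this class (illustrative
        # only; P and R are placeholders for the model arrays described above, with
        # shapes (n_states, n_actions, n_states) and (n_states, n_actions)) :
        #     ipe = IterativePolicyEvaluation()
        #     V = ipe.find_state_values(policy, P, R, maximal_error=1e-4, gamma=0.98,
        #                               initial_state_values="zeros", verbose=1)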
110 | 111 | n_states, n_actions = reward_probability.shape 112 | states_sweep = np.arange(n_states) 113 | if sweep_order == "reverse": 114 | states_sweep = np.flip(states_sweep) 115 | elif sweep_order == "random": 116 | np.random.shuffle(states_sweep) 117 | 118 | state_values = initialize_values( shape = (n_states,), 119 | initial_values=initial_state_values, 120 | typical_value=typical_value) 121 | if yield_frequency != "global_iteration": yield state_values 122 | n_iter = 0 123 | keep_iterating = True 124 | 125 | while keep_iterating: 126 | worst_error = 0 127 | yield f"DP Prediction of V (IPE) - Iteration {n_iter} :" 128 | for state in states_sweep: 129 | value = state_values[state] 130 | state_values[state] = self.compute_state_value(state, policy, transition_probability, reward_probability, state_values, gamma) 131 | worst_error = max(worst_error, abs(value - state_values[state])) 132 | if yield_frequency == "step" : yield state_values 133 | n_iter += 1 134 | if n_iterations != None and n_iter >= n_iterations: 135 | keep_iterating = False 136 | elif maximal_error != None and worst_error <= maximal_error: 137 | keep_iterating = False 138 | if yield_frequency == "iteration": yield state_values 139 | if yield_frequency == "global_iteration" : yield state_values 140 | 141 | 142 | 143 | def find_action_values(self, policy : DiscretePolicyForDiscreteState, 144 | transition_probability : np.ndarray, 145 | reward_probability : np.ndarray, 146 | n_iterations : int = None, 147 | maximal_error : float = None, 148 | gamma : float = 1, 149 | sweep_order : str = "random", # "normal" or "reverse" or "random" 150 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 151 | typical_value : float = 1, 152 | verbose = 1, 153 | **kwargs, 154 | ) -> np.ndarray: 155 | 156 | """This method perform the IterativePolicyEvaluation algorithm. It computes an estimation of the action values for a given policy, in a given model (transition_probability and reward_probability). 157 | The algorithm stop either after a given number of iterations or when the worst error (among the states+actions) between two Q(s,a) estimation consecutive is below a given threshold. 158 | 159 | transition_probability : a numpy array of shape (n_states, n_actions, n_states) representing the transition probability between states. 160 | reward_probability : a numpy array of shape (n_states, n_actions) representing the reward probability for each action in each state. 161 | n_iterations : the number of iterations to perform. 162 | maximal_error : the error between 2 consecutives state value below what the algorithm will stop, considering that it has converged. 163 | gamma : the discount factor. 164 | sweep_order : the order in which we will iterate over the states. "normal" or "reverse" or "random". This can have a significant impact on the convergence of the algorithm. 165 | initial_action_values : the initial values of the action values. Can be "random", "zeros", "optimistic" or a numpy array. 166 | typical_value : the typical value of the action values. Used to initialize the action values if initial_action_values is "random". 167 | verbose : the verbosity level, 0 for no output, 1 for an end output. 168 | """ 169 | 170 | assert n_iterations != None or maximal_error != None, "The stop condition is not well defined. Please specify either n_iterations or maximal_error." 
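        # compute_action_value below applies the Bellman expectation backup for action
        # values under the evaluated policy pi :
        #     Q(s, a) <- R(s, a) + gamma * sum_s' P(s'|s, a) * sum_a' pi(a'|s') * Q(s', a')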
171 | 172 | # Define the order in which we will iterate over the states 173 | n_states, n_actions = reward_probability.shape 174 | states_sweep = np.arange(n_states) 175 | if sweep_order == "reverse": 176 | states_sweep = np.flip(states_sweep) 177 | elif sweep_order == "random": 178 | np.random.shuffle(states_sweep) 179 | 180 | # Initialize the action values 181 | action_values = initialize_values( shape = (n_states, n_actions), 182 | initial_values = initial_action_values, 183 | typical_value = typical_value) 184 | n_iter = 0 185 | keep_iterating = True 186 | 187 | while keep_iterating: 188 | worst_error = 0 189 | # Iterate over the states and actions, update actions values value in an in-place manner (using only one array). 190 | for state in states_sweep: 191 | for action in range(n_actions): 192 | value = action_values[state][action] 193 | action_values[state][action] = self.compute_action_value(state, action, policy, transition_probability, reward_probability, action_values, gamma) 194 | worst_error = max(worst_error, abs(value - action_values[state][action])) 195 | # Stop algorithm if we reached the maximum number of iterations or if the error is below the threshold 196 | n_iter += 1 197 | if n_iterations != None and n_iter >= n_iterations: 198 | keep_iterating = False 199 | if verbose >= 1: print("The algorithm stopped after {} iterations. Stop condition : number of iteration reached.".format(n_iter)) 200 | elif maximal_error != None and worst_error <= maximal_error: 201 | keep_iterating = False 202 | if verbose >= 1: print("The algorithm stopped after {} iterations. Stop condition : worst error ({}) inferior to the maximal error asked ({})".format(n_iter, worst_error, maximal_error)) 203 | 204 | return action_values 205 | 206 | 207 | def compute_action_value(self, state : int, 208 | action : int, 209 | policy : DiscretePolicyForDiscreteState, 210 | transition_probability : np.ndarray, 211 | reward_probability : np.ndarray, 212 | q_values : np.ndarray, 213 | gamma : float) -> float: 214 | """This function compute the action value for a given state, action, a given policy, a given model (transition_probability and reward_probability), and for the action values vector. 215 | It applies the Bellman Operator to action values (the Bellman Operator is the right term in the Dynamic Bellman Equation for action values). 216 | """ 217 | n_states, n_actions = reward_probability.shape 218 | value = reward_probability[state, action] 219 | for next_state in range(n_states): 220 | value += gamma * transition_probability[state, action, next_state] * policy.probs[next_state].dot(q_values[next_state]) 221 | return value 222 | 223 | 224 | def find_action_values_yielding(self, policy : DiscretePolicyForDiscreteState, 225 | transition_probability : np.ndarray, 226 | reward_probability : np.ndarray, 227 | n_iterations : int = None, 228 | maximal_error : float = None, 229 | gamma : float = 1, 230 | sweep_order : str = "random", # "normal" or "reverse" or "random" 231 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 232 | typical_value : float = 1, 233 | yield_frequency : str = "step", # "step" or "iteration" 234 | **kwargs, 235 | ) -> Iterator: 236 | 237 | """This function is the same as find_action_values, but it yields the action values at each iteration. Use for observe the convergence of the algorithm. 238 | 239 | yield_frequency : "step" or "iteration", the frequency at which the action values are yielded. 
240 | """ 241 | 242 | assert n_iterations != None or maximal_error != None, "The stop condition is not well defined. Please specify either n_iterations or maximal_error." 243 | 244 | n_states, n_actions = reward_probability.shape 245 | states_sweep = np.arange(n_states) 246 | if sweep_order == "reverse": 247 | states_sweep = np.flip(states_sweep) 248 | elif sweep_order == "random": 249 | np.random.shuffle(states_sweep) 250 | 251 | action_values = initialize_values( shape = (n_states, n_actions), 252 | initial_values = initial_action_values, 253 | typical_value = typical_value) 254 | if yield_frequency != "global_iteration": yield action_values 255 | n_iter = 0 256 | keep_iterating = True 257 | 258 | while keep_iterating: 259 | worst_error = 0 260 | yield f"DP Prediction of Q (IPE) - Iteration {n_iter} :" 261 | for state in states_sweep: 262 | for action in range(n_actions): 263 | value = action_values[state][action] 264 | action_values[state][action] = self.compute_action_value(state, action, policy, transition_probability, reward_probability, action_values, gamma) 265 | worst_error = max(worst_error, abs(value - action_values[state][action])) 266 | if yield_frequency == "step" : yield action_values 267 | n_iter += 1 268 | if n_iterations != None and n_iter >= n_iterations: 269 | keep_iterating = False 270 | elif maximal_error != None and worst_error <= maximal_error: 271 | keep_iterating = False 272 | if yield_frequency == "iteration": yield action_values 273 | if yield_frequency == "global_iteration" : yield action_values 274 | 275 | 276 | 277 | class PolicyIteration: 278 | 279 | def find_optimal_policy(self, transition_probability : np.ndarray, 280 | reward_probability : np.ndarray, 281 | n_iterations : int = float("inf"), 282 | IPE_n_iterations : int = None, 283 | IPE_maximal_error : float = None, 284 | gamma : float = 1, 285 | sweep_order : str = "normal", # "normal" or "reverse" or "random" 286 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 287 | typical_value : float = 1, 288 | return_action_values : bool = False, 289 | verbose : int = 1, 290 | stop_if_policy_stable = True, 291 | ) -> DiscretePolicyForDiscreteState : 292 | 293 | """This method performs the Policy Iteration algorithm. It computes an optimal policy for a given model (transition_probability and reward_probability). 294 | The algorithm stop either when the policy is stable (no change in the policy) or when the number of iterations is reached. 295 | 296 | transition_probability : a numpy array of shape (n_states, n_actions, n_states) representing the transition probability between states and actions. 297 | reward_probability : a numpy array of shape (n_states, n_actions) representing the reward expected for each state and action. 298 | n_iterations : the number of iterations for the policy iteration algorithm. 299 | IPE_n_iterations : the number of iterations for the IPE algorithm. 300 | IPE_maximal_error : the maximal error allowed for the IPE algorithm. 301 | gamma : the discount factor 302 | sweep_order : the order in which we will iterate over the states. "normal" or "reverse" or "random". This can have a significant impact on the convergence of the algorithm. 303 | initial_values : the initial values for the action values ("random", "zeros", "optimistic" or a numpy array) 304 | typical_value : the typical value for the action values, used for scaling the "random" and "optimistic" value-initialization methods. 
305 | return_action_values : if True, the action values are returned with the policy 306 | verbose : the verbosity level. 0 : no print, 1 : print when PI has finished. 307 | stop_if_policy_stable : if True, the algorithm stops when the policy is stable because it consider the policy has converged. 308 | """ 309 | assert n_iterations >= 1, "The number of iterations must be strictly positive." 310 | 311 | if IPE_maximal_error is None and IPE_n_iterations is None: 312 | IPE_maximal_error = 0.01 313 | 314 | n_states, n_actions = reward_probability.shape 315 | actions = np.random.choice(np.array([a for a in range(n_actions)]), size = n_states,) 316 | action_values = initialize_values( shape = (n_states, n_actions), 317 | initial_values = initial_action_values, 318 | typical_value = typical_value) 319 | algo_IPE = IterativePolicyEvaluation() 320 | 321 | n_iter = 0 322 | while n_iter < n_iterations: 323 | 324 | #Iterative Policy Evaluation 325 | probs = np.zeros((n_states, n_actions)) # convert deterministic actions to stochastic policy 326 | probs[np.arange(n_states), actions] = 1 327 | policy = DiscretePolicyForDiscreteState(probs) 328 | action_values = algo_IPE.find_action_values(policy, #Evaluate the policy 329 | transition_probability, 330 | reward_probability, 331 | n_iterations = IPE_n_iterations, #Convergence criteria for the IPE 332 | maximal_error = IPE_maximal_error, 333 | gamma = gamma, 334 | sweep_order=sweep_order, 335 | 336 | initial_action_values = action_values, #Initialize the IPE with the previous action values computed, increase convergence a bit 337 | verbose = 0, #Silence the IPE method 338 | ) 339 | #Policy improvement 340 | actions_old = actions.copy() 341 | for state in range(n_states): 342 | actions[state] = np.argmax(action_values[state]) 343 | 344 | n_iter += 1 345 | if stop_if_policy_stable and (actions == actions_old).all(): 346 | break 347 | 348 | if verbose >= 1: 349 | if n_iter < n_iterations: 350 | print("Policy Iteration stopped after {} iterations. Stop condition : policy is stable.".format(n_iter)) 351 | else: 352 | print("Policy Iteration stopped after {} iterations. Stop condition : maximal number of iterations reached.".format(n_iter)) 353 | 354 | if return_action_values: 355 | return policy, action_values 356 | else: 357 | return policy 358 | 359 | 360 | 361 | def find_optimal_policy_yielding(self, transition_probability : np.ndarray, 362 | reward_probability : np.ndarray, 363 | IPE_n_iterations : int = None, 364 | IPE_maximal_error : float = None, 365 | n_iterations : int = float("inf"), 366 | gamma : float = 1, 367 | sweep_order : str = "normal", # "normal" or "reverse" or "random" 368 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 369 | typical_value : float = 1, 370 | yield_frequency : str = "step", # "step", "iteration" or "global_iteration" 371 | stop_if_policy_stable = True, 372 | **kwargs, 373 | ) -> Iterator: 374 | 375 | """This function is the same as find_optimal_policy, but it yields the actions and action values at each iteration. Use for observe the convergence of the algorithm. 376 | 377 | yield_frequency : "step" or "iteration", the frequency at which the state values are yielded. 378 | """ 379 | assert n_iterations >= 1, "The number of iterations must be strictly positive." 
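        # Minimal usage sketch for the non-yielding find_optimal_policy above
        # (illustrative only; P and R are placeholders for the transition and reward
        # arrays described in the docstring) :
        #     pi_algo = PolicyIteration()
        #     policy, q_values = pi_algo.find_optimal_policy(P, R, gamma=0.98,
        #                                                    IPE_maximal_error=0.01,
        #                                                    return_action_values=True, verbose=0)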
380 | 381 | if IPE_maximal_error is None and IPE_n_iterations is None: 382 | IPE_maximal_error = 0.01 383 | 384 | n_states, n_actions = reward_probability.shape 385 | actions = np.random.choice(np.array([a for a in range(n_actions)]), size = n_states,) 386 | action_values = initialize_values( shape = (n_states, n_actions), 387 | initial_values = initial_action_values, 388 | typical_value = typical_value) 389 | yield actions 390 | yield action_values 391 | algo_IPE = IterativePolicyEvaluation() 392 | 393 | n_iter = 0 394 | while n_iter < n_iterations: 395 | yield f"DP Control (PI or VI) - Iteration {n_iter}" 396 | #Iterative Policy Evaluation 397 | probs = np.zeros((n_states, n_actions)) # convert deterministic actions to stochastic policy 398 | probs[np.arange(n_states), actions] = 1 399 | policy = DiscretePolicyForDiscreteState(probs) 400 | for action_values_or_str in algo_IPE.find_action_values_yielding( policy, #Evaluate the policy 401 | transition_probability, 402 | reward_probability, 403 | n_iterations = IPE_n_iterations, #Convergence criteria for the IPE 404 | maximal_error = IPE_maximal_error, 405 | gamma = gamma, 406 | sweep_order=sweep_order, 407 | 408 | initial_action_values = action_values, #Initialize the IPE with the previous action values computed, increase convergence a bit 409 | yield_frequency=yield_frequency, 410 | ): 411 | yield action_values_or_str 412 | #Policy improvement 413 | actions_old = actions.copy() 414 | for state in range(n_states): 415 | actions[state] = np.argmax(action_values[state]) 416 | yield actions 417 | yield action_values 418 | n_iter += 1 419 | if stop_if_policy_stable and (actions == actions_old).all(): 420 | break 421 | 422 | 423 | 424 | class ValueIteration(PolicyIteration): 425 | 426 | algo_PI = PolicyIteration() 427 | 428 | def find_optimal_policy(self, transition_probability : np.ndarray, 429 | reward_probability : np.ndarray, 430 | n_iterations : int = None, 431 | gamma : float = 1, 432 | sweep_order : str = "normal", # "normal" or "reverse" or "random" 433 | initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array 434 | typical_value : float = 1, 435 | return_action_values : bool = False, 436 | verbose : int = 1, 437 | ) -> DiscretePolicyForDiscreteState: 438 | """This class implements the Value Iteration algorithm. It computes an optimal value function for a given model (transition_probability and reward_probability). 439 | The algorithm stop either when the value function is stable (no change in the value function) or when the number of iterations is reached. 440 | 441 | transition_probability : a numpy array of shape (n_states, n_actions, n_states) representing the transition probability between states and actions. 442 | reward_probability : a numpy array of shape (n_states, n_actions) representing the reward expected for each state and action. 443 | n_iterations : the number of iterations for the policy iteration algorithm. 444 | gamma : the discount factor 445 | sweep_order : the order in which we will iterate over the states. "normal" or "reverse" or "random". This can have a significant impact on the convergence of the algorithm. 446 | initial_values : the initial values for the action values ("random", "zeros", "optimistic" or a numpy array) 447 | typical_value : the typical value for the action values, used for scaling the "random" and "optimistic" value-initialization methods. 
448 |         return_action_values : if True, the action values are returned with the policy
449 |         verbose : the verbosity level. 0 : no print, 1 : print when VI has finished.
450 |         """
451 |         results = self.algo_PI.find_optimal_policy( transition_probability = transition_probability,
452 |                                                     reward_probability = reward_probability,
453 |                                                     n_iterations=n_iterations,
454 |                                                     IPE_n_iterations=1,
455 |                                                     gamma = gamma,
456 |                                                     sweep_order=sweep_order,
457 |                                                     initial_action_values=initial_action_values,
458 |                                                     typical_value=typical_value,
459 |                                                     return_action_values = return_action_values,
460 |                                                     stop_if_policy_stable = False,
461 |                                                     verbose = 0,)
462 | 
463 |         if verbose >= 1:
464 |             print("Value Iteration finished.")
465 | 
466 |         return results
467 | 
468 | 
469 | 
470 |     def find_optimal_policy_yielding(self, transition_probability : np.ndarray,
471 |                                         reward_probability : np.ndarray,
472 |                                         n_iterations : int = None,
473 |                                         gamma : float = 1,
474 |                                         sweep_order : str = "normal", # "normal" or "reverse" or "random"
475 |                                         initial_action_values : Union[np.ndarray, str] = "random", # "random" or "zeros" or "optimistic" or a numpy array
476 |                                         typical_value : float = 1,
477 |                                         yield_frequency : str = "step", # "step", "iteration" or "global_iteration"
478 |                                         **kwargs,
479 |                                         ) -> Iterator:
480 |         """This method performs the Value Iteration algorithm like find_optimal_policy, but yields pi(s) (the actions) and Q(s, a) during training.
481 |         """
482 |         results = self.algo_PI.find_optimal_policy_yielding( transition_probability = transition_probability,
483 |                                                              reward_probability = reward_probability,
484 |                                                              n_iterations=n_iterations,
485 |                                                              IPE_n_iterations=1,
486 |                                                              gamma = gamma,
487 |                                                              sweep_order=sweep_order,
488 |                                                              initial_action_values=initial_action_values,
489 |                                                              typical_value=typical_value,
490 |                                                              yield_frequency=yield_frequency,
491 |                                                              stop_if_policy_stable=False,)
492 | 
493 |         return results
--------------------------------------------------------------------------------